[llvm] [RISCV] Promote i8/i16/i32 scalable vector CLMUL to i64 CLMUL with Zvbc. (PR #184265)

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 2 21:51:48 PST 2026


https://github.com/topperc updated https://github.com/llvm/llvm-project/pull/184265
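
For context on the promotion in the PR title: carry-less multiplication combines its partial products with XOR rather than addition, so no bits ever propagate across positions. That is what makes a narrow clmul expressible at i64 (the element width Zvbc's vclmul provides): zero-extend the operands, do the i64 clmul, and truncate. A minimal standalone C++ sketch of that equivalence (illustration only, not LLVM code):

#include <cassert>
#include <cstdint>

// Reference carry-less multiply over the low `bits` positions: XOR
// together (x << i) for every set bit i of y. Partial products combine
// with XOR, so no carries cross bit positions.
static uint64_t clmul(uint64_t x, uint64_t y, unsigned bits) {
  uint64_t acc = 0;
  for (unsigned i = 0; i < bits; ++i)
    if ((y >> i) & 1)
      acc ^= x << i;
  return acc;
}

int main() {
  // Because clmul is purely bitwise, the low 8 bits of an i64 clmul of
  // zero-extended operands equal the i8 clmul of the originals; the same
  // holds for i16 and i32, which is what lets the i64 form cover the
  // narrower element types.
  uint8_t x = 0xb7, y = 0x5c;
  uint8_t direct   = (uint8_t)clmul(x, y, 8);   // native i8 clmul
  uint8_t promoted = (uint8_t)clmul(x, y, 64);  // zext to i64, clmul, trunc
  assert(direct == promoted);
}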

From 600f2a3e18c14a35f997871466f43c569306b9a1 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Mon, 2 Mar 2026 15:03:30 -0800
Subject: [PATCH 1/3] [LegalizeVectorOps][RISCV][PowerPC][AArch64][X86] Enable
 the clmul/clmulr/clmulh expansion code.

These opcodes weren't added to the master switch statement that
determines whether an operation should be legalized as a vector op.
---
 .../SelectionDAG/LegalizeVectorOps.cpp        |     3 +
 llvm/test/CodeGen/AArch64/clmul-fixed.ll      |  2356 +-
 llvm/test/CodeGen/AArch64/clmul-scalable.ll   |   633 +-
 llvm/test/CodeGen/PowerPC/clmul-vector.ll     |  6404 +-
 llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll   |  1986 +-
 llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll  | 81987 +++++++---------
 .../CodeGen/RISCV/rvv/fixed-vectors-clmul.ll  |   768 +-
 llvm/test/CodeGen/X86/clmul-vector-256.ll     |  1743 +-
 llvm/test/CodeGen/X86/clmul-vector-512.ll     |  2200 +-
 llvm/test/CodeGen/X86/clmul-vector.ll         |  5451 +-
 10 files changed, 46728 insertions(+), 56803 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 0b1d5bfd078d8..74fe5c5819982 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -389,6 +389,9 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::CTLZ_ZERO_UNDEF:
   case ISD::CTTZ_ZERO_UNDEF:
   case ISD::CTPOP:
+  case ISD::CLMUL:
+  case ISD::CLMULH:
+  case ISD::CLMULR:
   case ISD::SELECT:
   case ISD::VSELECT:
   case ISD::SELECT_CC:
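
The three opcodes added above differ only in which window of the double-width carry-less product they return. Assuming the ISD semantics mirror the RISC-V Zbc definitions: CLMUL is the low half, CLMULH the high half, and CLMULR the product shifted right by N-1 bits. A scalar C++ model at N = 32 (a sketch, not the in-tree code); the per-bit loop is also the shape of the expansion this patch enables, which is why the regenerated checks below are dominated by mask/umull/eor sequences — isolate one bit of an operand, multiply, XOR the partial products together:

#include <cstdint>

// Full (2*32-1)-bit carry-less product of two 32-bit values.
static uint64_t clmul_wide(uint32_t x, uint32_t y) {
  uint64_t acc = 0;
  for (unsigned i = 0; i < 32; ++i)
    if ((y >> i) & 1)
      acc ^= (uint64_t)x << i;
  return acc;
}

// The three variants as windows of the wide product (Zbc-style
// definitions; assumption: the ISD nodes match the RISC-V semantics).
uint32_t clmul_lo(uint32_t x, uint32_t y) { return (uint32_t)clmul_wide(x, y); }         // bits 31..0
uint32_t clmul_hi(uint32_t x, uint32_t y) { return (uint32_t)(clmul_wide(x, y) >> 32); } // bits 63..32
uint32_t clmul_r (uint32_t x, uint32_t y) { return (uint32_t)(clmul_wide(x, y) >> 31); } // bits 62..31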
diff --git a/llvm/test/CodeGen/AArch64/clmul-fixed.ll b/llvm/test/CodeGen/AArch64/clmul-fixed.ll
index 8205d6c80221d..23692dc456fc2 100644
--- a/llvm/test/CodeGen/AArch64/clmul-fixed.ll
+++ b/llvm/test/CodeGen/AArch64/clmul-fixed.ll
@@ -1730,23 +1730,23 @@ define <1 x i128> @clmul_v1i128_neon(<1 x i128> %x, <1 x i128> %y) {
 define <8 x i16> @clmul_v8i16_neon_zext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-LABEL: clmul_v8i16_neon_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.8h, #2
-; CHECK-NEXT:    movi v3.8h, #1
-; CHECK-NEXT:    movi v4.8h, #4
-; CHECK-NEXT:    movi v5.8h, #8
-; CHECK-NEXT:    movi v6.8h, #16
-; CHECK-NEXT:    movi v7.8h, #32
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    movi v16.8h, #64
-; CHECK-NEXT:    movi v17.8h, #128
-; CHECK-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v17.16b
+; CHECK-NEXT:    mov v2.16b, v1.16b
+; CHECK-NEXT:    mov v3.16b, v1.16b
+; CHECK-NEXT:    mov v4.16b, v1.16b
+; CHECK-NEXT:    mov v5.16b, v1.16b
+; CHECK-NEXT:    mov v6.16b, v1.16b
+; CHECK-NEXT:    mov v7.16b, v1.16b
+; CHECK-NEXT:    mov v16.16b, v1.16b
+; CHECK-NEXT:    bic v1.8h, #127
+; CHECK-NEXT:    bic v2.8h, #253
+; CHECK-NEXT:    bic v3.8h, #254
+; CHECK-NEXT:    bic v4.8h, #251
+; CHECK-NEXT:    bic v5.8h, #247
+; CHECK-NEXT:    bic v6.8h, #239
+; CHECK-NEXT:    bic v7.8h, #223
+; CHECK-NEXT:    bic v16.8h, #191
+; CHECK-NEXT:    xtn v1.8b, v1.8h
 ; CHECK-NEXT:    xtn v2.8b, v2.8h
 ; CHECK-NEXT:    xtn v3.8b, v3.8h
 ; CHECK-NEXT:    xtn v4.8b, v4.8h
@@ -1754,7 +1754,6 @@ define <8 x i16> @clmul_v8i16_neon_zext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-NEXT:    xtn v6.8b, v6.8h
 ; CHECK-NEXT:    xtn v7.8b, v7.8h
 ; CHECK-NEXT:    xtn v16.8b, v16.8h
-; CHECK-NEXT:    xtn v1.8b, v1.8h
 ; CHECK-NEXT:    umull v2.8h, v0.8b, v2.8b
 ; CHECK-NEXT:    umull v3.8h, v0.8b, v3.8b
 ; CHECK-NEXT:    umull v4.8h, v0.8b, v4.8b
@@ -1780,89 +1779,84 @@ define <8 x i16> @clmul_v8i16_neon_zext(<8 x i8> %x, <8 x i8> %y) {
 define <16 x i16> @clmul_v16i16_neon_zext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: clmul_v16i16_neon_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v4.8h, #2
-; CHECK-NEXT:    ushll v2.8h, v1.8b, #0
-; CHECK-NEXT:    movi v5.8h, #1
-; CHECK-NEXT:    movi v6.8h, #4
-; CHECK-NEXT:    movi v7.8h, #8
-; CHECK-NEXT:    movi v17.8h, #16
-; CHECK-NEXT:    ushll2 v3.8h, v1.16b, #0
-; CHECK-NEXT:    movi v18.8h, #32
-; CHECK-NEXT:    movi v1.8h, #128
-; CHECK-NEXT:    movi v19.8h, #64
-; CHECK-NEXT:    movi v25.2d, #0000000000000000
-; CHECK-NEXT:    and v16.16b, v2.16b, v4.16b
-; CHECK-NEXT:    and v20.16b, v2.16b, v5.16b
-; CHECK-NEXT:    and v21.16b, v2.16b, v6.16b
-; CHECK-NEXT:    and v22.16b, v2.16b, v7.16b
-; CHECK-NEXT:    and v4.16b, v3.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v3.16b, v5.16b
-; CHECK-NEXT:    and v6.16b, v3.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v3.16b, v7.16b
-; CHECK-NEXT:    and v23.16b, v3.16b, v17.16b
-; CHECK-NEXT:    and v24.16b, v3.16b, v18.16b
-; CHECK-NEXT:    and v26.16b, v3.16b, v1.16b
-; CHECK-NEXT:    and v17.16b, v2.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v2.16b, v18.16b
+; CHECK-NEXT:    ushll2 v2.8h, v1.16b, #0
+; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEXT:    mov v4.16b, v2.16b
+; CHECK-NEXT:    mov v5.16b, v2.16b
+; CHECK-NEXT:    mov v6.16b, v2.16b
+; CHECK-NEXT:    mov v7.16b, v2.16b
+; CHECK-NEXT:    mov v16.16b, v2.16b
+; CHECK-NEXT:    mov v17.16b, v2.16b
+; CHECK-NEXT:    mov v18.16b, v1.16b
+; CHECK-NEXT:    mov v19.16b, v1.16b
+; CHECK-NEXT:    mov v20.16b, v1.16b
+; CHECK-NEXT:    mov v21.16b, v1.16b
+; CHECK-NEXT:    mov v22.16b, v1.16b
+; CHECK-NEXT:    mov v23.16b, v1.16b
+; CHECK-NEXT:    bic v4.8h, #253
+; CHECK-NEXT:    bic v5.8h, #254
+; CHECK-NEXT:    bic v6.8h, #251
+; CHECK-NEXT:    bic v7.8h, #247
+; CHECK-NEXT:    mov v3.16b, v2.16b
+; CHECK-NEXT:    bic v16.8h, #239
+; CHECK-NEXT:    bic v17.8h, #223
+; CHECK-NEXT:    bic v18.8h, #253
+; CHECK-NEXT:    bic v19.8h, #254
+; CHECK-NEXT:    bic v20.8h, #251
+; CHECK-NEXT:    bic v21.8h, #247
+; CHECK-NEXT:    bic v22.8h, #239
+; CHECK-NEXT:    bic v23.8h, #223
+; CHECK-NEXT:    mov v24.16b, v1.16b
 ; CHECK-NEXT:    uzp1 v4.16b, v0.16b, v4.16b
 ; CHECK-NEXT:    uzp1 v5.16b, v0.16b, v5.16b
 ; CHECK-NEXT:    uzp1 v6.16b, v0.16b, v6.16b
 ; CHECK-NEXT:    uzp1 v7.16b, v0.16b, v7.16b
-; CHECK-NEXT:    uzp1 v23.16b, v0.16b, v23.16b
-; CHECK-NEXT:    uzp1 v24.16b, v0.16b, v24.16b
-; CHECK-NEXT:    and v3.16b, v3.16b, v19.16b
-; CHECK-NEXT:    uzp1 v26.16b, v0.16b, v26.16b
-; CHECK-NEXT:    uzp1 v25.16b, v0.16b, v25.16b
-; CHECK-NEXT:    xtn v16.8b, v16.8h
+; CHECK-NEXT:    bic v3.8h, #191
+; CHECK-NEXT:    uzp1 v16.16b, v0.16b, v16.16b
+; CHECK-NEXT:    uzp1 v17.16b, v0.16b, v17.16b
+; CHECK-NEXT:    xtn v18.8b, v18.8h
+; CHECK-NEXT:    xtn v19.8b, v19.8h
 ; CHECK-NEXT:    xtn v20.8b, v20.8h
 ; CHECK-NEXT:    xtn v21.8b, v21.8h
 ; CHECK-NEXT:    xtn v22.8b, v22.8h
-; CHECK-NEXT:    xtn v17.8b, v17.8h
-; CHECK-NEXT:    xtn v18.8b, v18.8h
-; CHECK-NEXT:    and v19.16b, v2.16b, v19.16b
-; CHECK-NEXT:    uzp1 v3.16b, v0.16b, v3.16b
+; CHECK-NEXT:    xtn v23.8b, v23.8h
+; CHECK-NEXT:    bic v24.8h, #191
 ; CHECK-NEXT:    umull2 v4.8h, v0.16b, v4.16b
 ; CHECK-NEXT:    umull2 v5.8h, v0.16b, v5.16b
 ; CHECK-NEXT:    umull2 v6.8h, v0.16b, v6.16b
 ; CHECK-NEXT:    umull2 v7.8h, v0.16b, v7.16b
-; CHECK-NEXT:    umull2 v23.8h, v0.16b, v23.16b
-; CHECK-NEXT:    umull2 v24.8h, v0.16b, v24.16b
-; CHECK-NEXT:    umull2 v26.8h, v0.16b, v26.16b
-; CHECK-NEXT:    umull2 v25.8h, v0.16b, v25.16b
-; CHECK-NEXT:    xtn v19.8b, v19.8h
-; CHECK-NEXT:    umull v16.8h, v0.8b, v16.8b
+; CHECK-NEXT:    uzp1 v3.16b, v0.16b, v3.16b
+; CHECK-NEXT:    umull2 v16.8h, v0.16b, v16.16b
+; CHECK-NEXT:    umull2 v17.8h, v0.16b, v17.16b
+; CHECK-NEXT:    umull v18.8h, v0.8b, v18.8b
+; CHECK-NEXT:    xtn v24.8b, v24.8h
+; CHECK-NEXT:    umull v19.8h, v0.8b, v19.8b
 ; CHECK-NEXT:    umull v20.8h, v0.8b, v20.8b
 ; CHECK-NEXT:    umull v21.8h, v0.8b, v21.8b
 ; CHECK-NEXT:    umull v22.8h, v0.8b, v22.8b
-; CHECK-NEXT:    umull v17.8h, v0.8b, v17.8b
-; CHECK-NEXT:    umull v18.8h, v0.8b, v18.8b
-; CHECK-NEXT:    umull2 v3.8h, v0.16b, v3.16b
+; CHECK-NEXT:    umull v23.8h, v0.8b, v23.8b
+; CHECK-NEXT:    bic v2.8h, #127
+; CHECK-NEXT:    bic v1.8h, #127
 ; CHECK-NEXT:    eor v4.16b, v5.16b, v4.16b
 ; CHECK-NEXT:    eor v5.16b, v6.16b, v7.16b
-; CHECK-NEXT:    eor v6.16b, v23.16b, v24.16b
-; CHECK-NEXT:    eor v7.16b, v26.16b, v25.16b
-; CHECK-NEXT:    eor v23.16b, v25.16b, v25.16b
-; CHECK-NEXT:    and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT:    umull v2.8h, v0.8b, v19.8b
-; CHECK-NEXT:    eor v16.16b, v20.16b, v16.16b
-; CHECK-NEXT:    eor v19.16b, v21.16b, v22.16b
-; CHECK-NEXT:    eor v17.16b, v17.16b, v18.16b
+; CHECK-NEXT:    umull2 v3.8h, v0.16b, v3.16b
+; CHECK-NEXT:    eor v6.16b, v16.16b, v17.16b
+; CHECK-NEXT:    umull v7.8h, v0.8b, v24.8b
+; CHECK-NEXT:    eor v16.16b, v19.16b, v18.16b
+; CHECK-NEXT:    eor v17.16b, v20.16b, v21.16b
+; CHECK-NEXT:    eor v18.16b, v22.16b, v23.16b
+; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
+; CHECK-NEXT:    xtn v1.8b, v1.8h
 ; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
 ; CHECK-NEXT:    eor v3.16b, v6.16b, v3.16b
-; CHECK-NEXT:    eor v5.16b, v7.16b, v25.16b
-; CHECK-NEXT:    eor v6.16b, v23.16b, v25.16b
-; CHECK-NEXT:    xtn v1.8b, v1.8h
-; CHECK-NEXT:    eor v7.16b, v16.16b, v19.16b
-; CHECK-NEXT:    eor v2.16b, v17.16b, v2.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT:    eor v4.16b, v5.16b, v25.16b
-; CHECK-NEXT:    eor v5.16b, v6.16b, v25.16b
+; CHECK-NEXT:    eor v5.16b, v16.16b, v17.16b
+; CHECK-NEXT:    eor v6.16b, v18.16b, v7.16b
+; CHECK-NEXT:    umull2 v2.8h, v0.16b, v2.16b
 ; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    eor v1.16b, v7.16b, v2.16b
-; CHECK-NEXT:    eor v2.16b, v3.16b, v4.16b
-; CHECK-NEXT:    eor v3.16b, v5.16b, v25.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
+; CHECK-NEXT:    eor v1.16b, v4.16b, v3.16b
+; CHECK-NEXT:    eor v3.16b, v5.16b, v6.16b
+; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
 ; CHECK-NEXT:    ret
   %zextx = zext <16 x i8> %x to <16 x i16>
   %zexty = zext <16 x i8> %y to <16 x i16>
@@ -1963,168 +1957,162 @@ define <4 x i32> @clmul_v4i32_neon_zext(<4 x i16> %x, <4 x i16> %y) {
 define <8 x i32> @clmul_v8i32_neon_zext(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: clmul_v8i32_neon_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d9, d8, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    str d12, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    .cfi_offset b8, -8
 ; CHECK-NEXT:    .cfi_offset b9, -16
-; CHECK-NEXT:    movi v4.4s, #2
-; CHECK-NEXT:    movi v5.4s, #1
-; CHECK-NEXT:    movi v6.4s, #4
-; CHECK-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-NEXT:    movi v3.4s, #8
-; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    movi v20.4s, #16
-; CHECK-NEXT:    movi v21.4s, #32
-; CHECK-NEXT:    and v17.16b, v2.16b, v4.16b
-; CHECK-NEXT:    and v7.16b, v2.16b, v5.16b
-; CHECK-NEXT:    and v16.16b, v2.16b, v6.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v3.16b, v2.16b, v3.16b
-; CHECK-NEXT:    and v24.16b, v1.16b, v20.16b
-; CHECK-NEXT:    xtn v17.4h, v17.4s
-; CHECK-NEXT:    xtn v7.4h, v7.4s
-; CHECK-NEXT:    xtn v19.4h, v16.4s
-; CHECK-NEXT:    uzp1 v4.8h, v0.8h, v4.8h
-; CHECK-NEXT:    uzp1 v22.8h, v0.8h, v5.8h
-; CHECK-NEXT:    uzp1 v23.8h, v0.8h, v6.8h
-; CHECK-NEXT:    uzp1 v18.8h, v0.8h, v18.8h
-; CHECK-NEXT:    and v25.16b, v1.16b, v21.16b
-; CHECK-NEXT:    movi v6.4s, #128
-; CHECK-NEXT:    uzp1 v24.8h, v0.8h, v24.8h
-; CHECK-NEXT:    and v28.16b, v2.16b, v20.16b
-; CHECK-NEXT:    and v21.16b, v2.16b, v21.16b
-; CHECK-NEXT:    umull v5.4s, v0.4h, v17.4h
-; CHECK-NEXT:    umull v16.4s, v0.4h, v7.4h
-; CHECK-NEXT:    umull v17.4s, v0.4h, v19.4h
-; CHECK-NEXT:    xtn v19.4h, v3.4s
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -48
+; CHECK-NEXT:    movi v19.4s, #2
+; CHECK-NEXT:    movi v21.4s, #1
+; CHECK-NEXT:    ushll2 v2.4s, v1.8h, #0
+; CHECK-NEXT:    movi v17.4s, #4
+; CHECK-NEXT:    movi v20.4s, #8
+; CHECK-NEXT:    movi v5.4s, #16
+; CHECK-NEXT:    movi v4.4s, #32
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    and v3.16b, v2.16b, v19.16b
+; CHECK-NEXT:    and v6.16b, v2.16b, v21.16b
+; CHECK-NEXT:    and v7.16b, v2.16b, v17.16b
+; CHECK-NEXT:    and v16.16b, v2.16b, v20.16b
+; CHECK-NEXT:    and v18.16b, v2.16b, v5.16b
+; CHECK-NEXT:    and v22.16b, v2.16b, v4.16b
+; CHECK-NEXT:    and v19.16b, v1.16b, v19.16b
+; CHECK-NEXT:    and v20.16b, v1.16b, v20.16b
+; CHECK-NEXT:    uzp1 v23.8h, v0.8h, v3.8h
 ; CHECK-NEXT:    movi v3.4s, #64
+; CHECK-NEXT:    uzp1 v24.8h, v0.8h, v6.8h
+; CHECK-NEXT:    movi v6.4s, #128
+; CHECK-NEXT:    uzp1 v25.8h, v0.8h, v7.8h
 ; CHECK-NEXT:    movi v7.4s, #1, lsl #8
-; CHECK-NEXT:    umull2 v26.4s, v0.8h, v4.8h
-; CHECK-NEXT:    umull2 v22.4s, v0.8h, v22.8h
-; CHECK-NEXT:    umull2 v23.4s, v0.8h, v23.8h
-; CHECK-NEXT:    umull2 v27.4s, v0.8h, v18.8h
-; CHECK-NEXT:    uzp1 v25.8h, v0.8h, v25.8h
-; CHECK-NEXT:    movi v4.4s, #2, lsl #8
-; CHECK-NEXT:    and v30.16b, v1.16b, v6.16b
-; CHECK-NEXT:    movi v18.4s, #8, lsl #8
-; CHECK-NEXT:    movi v20.4s, #16, lsl #8
-; CHECK-NEXT:    and v29.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v31.16b, v1.16b, v7.16b
+; CHECK-NEXT:    uzp1 v26.8h, v0.8h, v16.8h
+; CHECK-NEXT:    uzp1 v27.8h, v0.8h, v18.8h
+; CHECK-NEXT:    uzp1 v28.8h, v0.8h, v22.8h
+; CHECK-NEXT:    movi v16.4s, #8, lsl #8
+; CHECK-NEXT:    movi v18.4s, #16, lsl #8
+; CHECK-NEXT:    movi v22.4s, #2, lsl #8
+; CHECK-NEXT:    umull2 v29.4s, v0.8h, v23.8h
+; CHECK-NEXT:    and v23.16b, v2.16b, v3.16b
 ; CHECK-NEXT:    umull2 v24.4s, v0.8h, v24.8h
-; CHECK-NEXT:    eor v22.16b, v22.16b, v26.16b
-; CHECK-NEXT:    xtn v28.4h, v28.4s
-; CHECK-NEXT:    umull v19.4s, v0.4h, v19.4h
-; CHECK-NEXT:    eor v23.16b, v23.16b, v27.16b
+; CHECK-NEXT:    and v30.16b, v2.16b, v6.16b
+; CHECK-NEXT:    and v31.16b, v2.16b, v7.16b
 ; CHECK-NEXT:    umull2 v25.4s, v0.8h, v25.8h
-; CHECK-NEXT:    uzp1 v27.8h, v0.8h, v30.8h
-; CHECK-NEXT:    uzp1 v26.8h, v0.8h, v29.8h
-; CHECK-NEXT:    uzp1 v29.8h, v0.8h, v31.8h
-; CHECK-NEXT:    and v30.16b, v1.16b, v4.16b
-; CHECK-NEXT:    xtn v31.4h, v21.4s
-; CHECK-NEXT:    movi v21.4s, #32, lsl #8
-; CHECK-NEXT:    and v8.16b, v1.16b, v20.16b
-; CHECK-NEXT:    eor v22.16b, v22.16b, v23.16b
-; CHECK-NEXT:    and v23.16b, v1.16b, v18.16b
-; CHECK-NEXT:    umull v28.4s, v0.4h, v28.4h
-; CHECK-NEXT:    eor v24.16b, v24.16b, v25.16b
-; CHECK-NEXT:    umull2 v27.4s, v0.8h, v27.8h
-; CHECK-NEXT:    eor v16.16b, v16.16b, v5.16b
-; CHECK-NEXT:    umull2 v25.4s, v0.8h, v26.8h
-; CHECK-NEXT:    uzp1 v26.8h, v0.8h, v30.8h
-; CHECK-NEXT:    umull2 v29.4s, v0.8h, v29.8h
-; CHECK-NEXT:    movi v30.2d, #0000000000000000
-; CHECK-NEXT:    uzp1 v23.8h, v0.8h, v23.8h
-; CHECK-NEXT:    uzp1 v8.8h, v0.8h, v8.8h
-; CHECK-NEXT:    and v9.16b, v1.16b, v21.16b
-; CHECK-NEXT:    umull v31.4s, v0.4h, v31.4h
-; CHECK-NEXT:    eor v17.16b, v17.16b, v19.16b
-; CHECK-NEXT:    and v6.16b, v2.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v2.16b, v7.16b
-; CHECK-NEXT:    and v18.16b, v2.16b, v18.16b
 ; CHECK-NEXT:    umull2 v26.4s, v0.8h, v26.8h
-; CHECK-NEXT:    eor v27.16b, v27.16b, v29.16b
-; CHECK-NEXT:    eor v24.16b, v24.16b, v25.16b
-; CHECK-NEXT:    uzp1 v29.8h, v0.8h, v9.8h
+; CHECK-NEXT:    umull2 v27.4s, v0.8h, v27.8h
+; CHECK-NEXT:    umull2 v28.4s, v0.8h, v28.8h
+; CHECK-NEXT:    uzp1 v10.8h, v0.8h, v23.8h
+; CHECK-NEXT:    movi v23.4s, #32, lsl #8
+; CHECK-NEXT:    and v8.16b, v2.16b, v16.16b
+; CHECK-NEXT:    and v9.16b, v2.16b, v18.16b
 ; CHECK-NEXT:    uzp1 v30.8h, v0.8h, v30.8h
-; CHECK-NEXT:    movi v9.4s, #64, lsl #8
-; CHECK-NEXT:    umull2 v23.4s, v0.8h, v23.8h
+; CHECK-NEXT:    uzp1 v31.8h, v0.8h, v31.8h
+; CHECK-NEXT:    and v11.16b, v2.16b, v22.16b
+; CHECK-NEXT:    eor v24.16b, v24.16b, v29.16b
+; CHECK-NEXT:    xtn v12.4h, v19.4s
+; CHECK-NEXT:    uzp1 v8.8h, v0.8h, v8.8h
+; CHECK-NEXT:    eor v25.16b, v25.16b, v26.16b
+; CHECK-NEXT:    eor v26.16b, v27.16b, v28.16b
+; CHECK-NEXT:    uzp1 v9.8h, v0.8h, v9.8h
+; CHECK-NEXT:    and v29.16b, v2.16b, v23.16b
+; CHECK-NEXT:    umull2 v27.4s, v0.8h, v10.8h
+; CHECK-NEXT:    umull2 v28.4s, v0.8h, v30.8h
+; CHECK-NEXT:    uzp1 v30.8h, v0.8h, v11.8h
+; CHECK-NEXT:    umull2 v31.4s, v0.8h, v31.8h
+; CHECK-NEXT:    and v11.16b, v1.16b, v17.16b
+; CHECK-NEXT:    eor v17.16b, v24.16b, v25.16b
+; CHECK-NEXT:    and v10.16b, v1.16b, v21.16b
+; CHECK-NEXT:    uzp1 v29.8h, v0.8h, v29.8h
 ; CHECK-NEXT:    umull2 v8.4s, v0.8h, v8.8h
-; CHECK-NEXT:    movi v25.4s, #4, lsl #8
-; CHECK-NEXT:    eor v22.16b, v22.16b, v24.16b
-; CHECK-NEXT:    eor v19.16b, v28.16b, v31.16b
-; CHECK-NEXT:    movi v28.4s, #128, lsl #8
-; CHECK-NEXT:    eor v24.16b, v27.16b, v26.16b
-; CHECK-NEXT:    and v20.16b, v2.16b, v20.16b
-; CHECK-NEXT:    xtn v6.4h, v6.4s
+; CHECK-NEXT:    movi v21.4s, #4, lsl #8
+; CHECK-NEXT:    umull2 v9.4s, v0.8h, v9.8h
+; CHECK-NEXT:    eor v19.16b, v26.16b, v27.16b
+; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
+; CHECK-NEXT:    umull2 v24.4s, v0.8h, v30.8h
+; CHECK-NEXT:    eor v25.16b, v28.16b, v31.16b
+; CHECK-NEXT:    xtn v28.4h, v11.4s
+; CHECK-NEXT:    xtn v30.4h, v20.4s
+; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
+; CHECK-NEXT:    and v18.16b, v1.16b, v18.16b
 ; CHECK-NEXT:    umull2 v27.4s, v0.8h, v29.8h
-; CHECK-NEXT:    umull2 v5.4s, v0.8h, v30.8h
-; CHECK-NEXT:    and v29.16b, v1.16b, v9.16b
-; CHECK-NEXT:    eor v23.16b, v23.16b, v8.16b
-; CHECK-NEXT:    and v26.16b, v1.16b, v25.16b
+; CHECK-NEXT:    xtn v10.4h, v10.4s
+; CHECK-NEXT:    and v29.16b, v2.16b, v21.16b
+; CHECK-NEXT:    eor v26.16b, v8.16b, v9.16b
+; CHECK-NEXT:    and v9.16b, v1.16b, v4.16b
+; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
+; CHECK-NEXT:    eor v20.16b, v25.16b, v24.16b
+; CHECK-NEXT:    and v25.16b, v1.16b, v5.16b
+; CHECK-NEXT:    umull v28.4s, v0.4h, v28.4h
+; CHECK-NEXT:    umull v30.4s, v0.4h, v30.4h
+; CHECK-NEXT:    movi v24.4s, #64, lsl #8
 ; CHECK-NEXT:    xtn v7.4h, v7.4s
-; CHECK-NEXT:    and v1.16b, v1.16b, v28.16b
-; CHECK-NEXT:    and v4.16b, v2.16b, v4.16b
+; CHECK-NEXT:    eor v4.16b, v26.16b, v27.16b
+; CHECK-NEXT:    and v26.16b, v1.16b, v6.16b
+; CHECK-NEXT:    xtn v27.4h, v9.4s
+; CHECK-NEXT:    xtn v25.4h, v25.4s
+; CHECK-NEXT:    and v22.16b, v1.16b, v22.16b
+; CHECK-NEXT:    xtn v16.4h, v16.4s
 ; CHECK-NEXT:    xtn v18.4h, v18.4s
-; CHECK-NEXT:    xtn v20.4h, v20.4s
-; CHECK-NEXT:    and v3.16b, v2.16b, v3.16b
-; CHECK-NEXT:    and v21.16b, v2.16b, v21.16b
-; CHECK-NEXT:    eor v23.16b, v23.16b, v27.16b
-; CHECK-NEXT:    uzp1 v27.8h, v0.8h, v29.8h
-; CHECK-NEXT:    eor v29.16b, v5.16b, v5.16b
-; CHECK-NEXT:    uzp1 v26.8h, v0.8h, v26.8h
-; CHECK-NEXT:    uzp1 v1.8h, v0.8h, v1.8h
-; CHECK-NEXT:    xtn v4.4h, v4.4s
+; CHECK-NEXT:    and v23.16b, v1.16b, v23.16b
+; CHECK-NEXT:    uzp1 v5.8h, v0.8h, v29.8h
+; CHECK-NEXT:    xtn v26.4h, v26.4s
+; CHECK-NEXT:    eor v28.16b, v28.16b, v30.16b
+; CHECK-NEXT:    movi v30.4s, #128, lsl #8
+; CHECK-NEXT:    umull v27.4s, v0.4h, v27.4h
+; CHECK-NEXT:    and v29.16b, v2.16b, v24.16b
 ; CHECK-NEXT:    xtn v3.4h, v3.4s
-; CHECK-NEXT:    umull v6.4s, v0.4h, v6.4h
+; CHECK-NEXT:    umull v25.4s, v0.4h, v25.4h
+; CHECK-NEXT:    xtn v22.4h, v22.4s
+; CHECK-NEXT:    and v21.16b, v1.16b, v21.16b
+; CHECK-NEXT:    xtn v23.4h, v23.4s
+; CHECK-NEXT:    and v24.16b, v1.16b, v24.16b
+; CHECK-NEXT:    umull v31.4s, v0.4h, v12.4h
+; CHECK-NEXT:    umull v8.4s, v0.4h, v10.4h
+; CHECK-NEXT:    ldp d11, d10, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    umull v26.4s, v0.4h, v26.4h
 ; CHECK-NEXT:    umull v7.4s, v0.4h, v7.4h
-; CHECK-NEXT:    eor v29.16b, v29.16b, v5.16b
-; CHECK-NEXT:    and v25.16b, v2.16b, v25.16b
+; CHECK-NEXT:    umull v16.4s, v0.4h, v16.4h
 ; CHECK-NEXT:    umull v18.4s, v0.4h, v18.4h
-; CHECK-NEXT:    umull2 v27.4s, v0.8h, v27.8h
-; CHECK-NEXT:    umull v20.4s, v0.4h, v20.4h
+; CHECK-NEXT:    eor v25.16b, v25.16b, v27.16b
+; CHECK-NEXT:    uzp1 v27.8h, v0.8h, v29.8h
+; CHECK-NEXT:    and v2.16b, v2.16b, v30.16b
+; CHECK-NEXT:    and v1.16b, v1.16b, v30.16b
 ; CHECK-NEXT:    xtn v21.4h, v21.4s
-; CHECK-NEXT:    umull2 v26.4s, v0.8h, v26.8h
-; CHECK-NEXT:    and v30.16b, v2.16b, v9.16b
-; CHECK-NEXT:    umull2 v1.4s, v0.8h, v1.8h
-; CHECK-NEXT:    eor v29.16b, v29.16b, v5.16b
-; CHECK-NEXT:    xtn v25.4h, v25.4s
-; CHECK-NEXT:    umull v4.4s, v0.4h, v4.4h
-; CHECK-NEXT:    and v2.16b, v2.16b, v28.16b
+; CHECK-NEXT:    xtn v24.4h, v24.4s
 ; CHECK-NEXT:    umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT:    eor v23.16b, v23.16b, v27.16b
-; CHECK-NEXT:    xtn v27.4h, v30.4s
-; CHECK-NEXT:    eor v7.16b, v18.16b, v20.16b
-; CHECK-NEXT:    eor v24.16b, v24.16b, v26.16b
-; CHECK-NEXT:    eor v26.16b, v29.16b, v5.16b
-; CHECK-NEXT:    umull v18.4s, v0.4h, v21.4h
-; CHECK-NEXT:    xtn v2.4h, v2.4s
-; CHECK-NEXT:    eor v16.16b, v16.16b, v17.16b
-; CHECK-NEXT:    umull v17.4s, v0.4h, v25.4h
-; CHECK-NEXT:    eor v1.16b, v23.16b, v1.16b
-; CHECK-NEXT:    eor v4.16b, v6.16b, v4.16b
-; CHECK-NEXT:    eor v3.16b, v19.16b, v3.16b
-; CHECK-NEXT:    eor v20.16b, v22.16b, v24.16b
-; CHECK-NEXT:    eor v21.16b, v26.16b, v5.16b
-; CHECK-NEXT:    umull v6.4s, v0.4h, v27.4h
-; CHECK-NEXT:    eor v7.16b, v7.16b, v18.16b
-; CHECK-NEXT:    umull v0.4s, v0.4h, v2.4h
-; CHECK-NEXT:    eor v3.16b, v16.16b, v3.16b
-; CHECK-NEXT:    eor v4.16b, v4.16b, v17.16b
-; CHECK-NEXT:    eor v1.16b, v20.16b, v1.16b
-; CHECK-NEXT:    eor v18.16b, v21.16b, v5.16b
-; CHECK-NEXT:    eor v2.16b, v7.16b, v6.16b
-; CHECK-NEXT:    eor v3.16b, v3.16b, v4.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v18.16b
-; CHECK-NEXT:    eor v6.16b, v18.16b, v5.16b
-; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v6.16b
+; CHECK-NEXT:    umull v22.4s, v0.4h, v22.4h
+; CHECK-NEXT:    umull v23.4s, v0.4h, v23.4h
+; CHECK-NEXT:    eor v6.16b, v8.16b, v31.16b
+; CHECK-NEXT:    eor v7.16b, v26.16b, v7.16b
+; CHECK-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    eor v16.16b, v16.16b, v18.16b
+; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v2.8h
+; CHECK-NEXT:    xtn v1.4h, v1.4s
+; CHECK-NEXT:    umull2 v5.4s, v0.8h, v5.8h
+; CHECK-NEXT:    umull2 v18.4s, v0.8h, v27.8h
+; CHECK-NEXT:    umull v21.4s, v0.4h, v21.4h
+; CHECK-NEXT:    umull v24.4s, v0.4h, v24.4h
+; CHECK-NEXT:    eor v6.16b, v6.16b, v28.16b
+; CHECK-NEXT:    eor v3.16b, v25.16b, v3.16b
+; CHECK-NEXT:    eor v7.16b, v7.16b, v22.16b
+; CHECK-NEXT:    eor v16.16b, v16.16b, v23.16b
+; CHECK-NEXT:    eor v17.16b, v17.16b, v19.16b
+; CHECK-NEXT:    umull2 v2.4s, v0.8h, v2.8h
+; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT:    eor v5.16b, v20.16b, v5.16b
+; CHECK-NEXT:    eor v4.16b, v4.16b, v18.16b
+; CHECK-NEXT:    eor v1.16b, v6.16b, v3.16b
+; CHECK-NEXT:    eor v3.16b, v7.16b, v21.16b
+; CHECK-NEXT:    eor v6.16b, v16.16b, v24.16b
+; CHECK-NEXT:    eor v5.16b, v17.16b, v5.16b
+; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT:    eor v3.16b, v1.16b, v3.16b
+; CHECK-NEXT:    eor v0.16b, v6.16b, v0.16b
+; CHECK-NEXT:    eor v1.16b, v5.16b, v2.16b
 ; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v5.16b
-; CHECK-NEXT:    ldp d9, d8, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ldr d12, [sp], #48 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %zextx = zext <8 x i16> %x to <8 x i32>
   %zexty = zext <8 x i16> %y to <8 x i32>
@@ -2136,205 +2124,194 @@ define <2 x i64> @clmul_v2i64_neon_zext(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-NEON-LABEL: clmul_v2i64_neon_zext:
 ; CHECK-NEON:       // %bb.0:
 ; CHECK-NEON-NEXT:    mov w8, #2 // =0x2
-; CHECK-NEON-NEXT:    mov w9, #4 // =0x4
+; CHECK-NEON-NEXT:    mov w9, #8 // =0x8
 ; CHECK-NEON-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-NEON-NEXT:    dup v2.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEON-NEXT:    dup v4.2d, x9
+; CHECK-NEON-NEXT:    dup v5.2d, x9
 ; CHECK-NEON-NEXT:    dup v3.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #8 // =0x8
-; CHECK-NEON-NEXT:    mov w9, #32 // =0x20
-; CHECK-NEON-NEXT:    dup v5.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #4 // =0x4
+; CHECK-NEON-NEXT:    mov w9, #512 // =0x200
+; CHECK-NEON-NEXT:    dup v4.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #16 // =0x10
-; CHECK-NEON-NEXT:    dup v7.2d, x9
 ; CHECK-NEON-NEXT:    dup v6.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #32 // =0x20
 ; CHECK-NEON-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEON-NEXT:    and v4.16b, v1.16b, v4.16b
 ; CHECK-NEON-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEON-NEXT:    mov w8, #64 // =0x40
-; CHECK-NEON-NEXT:    movi v24.2d, #0000000000000000
 ; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEON-NEXT:    dup v16.2d, x8
-; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v6.16b
+; CHECK-NEON-NEXT:    dup v7.2d, x8
+; CHECK-NEON-NEXT:    and v4.16b, v1.16b, v4.16b
+; CHECK-NEON-NEXT:    mov w8, #64 // =0x40
 ; CHECK-NEON-NEXT:    xtn v2.2s, v2.2d
-; CHECK-NEON-NEXT:    mov w8, #128 // =0x80
+; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v6.16b
+; CHECK-NEON-NEXT:    dup v16.2d, x8
 ; CHECK-NEON-NEXT:    xtn v3.2s, v3.2d
+; CHECK-NEON-NEXT:    xtn v5.2s, v5.2d
+; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
 ; CHECK-NEON-NEXT:    xtn v4.2s, v4.2d
+; CHECK-NEON-NEXT:    mov w8, #128 // =0x80
+; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
 ; CHECK-NEON-NEXT:    dup v17.2d, x8
-; CHECK-NEON-NEXT:    xtn v5.2s, v5.2d
 ; CHECK-NEON-NEXT:    mov w8, #256 // =0x100
 ; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #512 // =0x200
 ; CHECK-NEON-NEXT:    umull v2.2d, v0.2s, v2.2s
-; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
+; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v16.16b
 ; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
-; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
-; CHECK-NEON-NEXT:    fmov v26.2d, #2.00000000
 ; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
+; CHECK-NEON-NEXT:    dup v18.2d, x8
+; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
+; CHECK-NEON-NEXT:    mov w8, #2048 // =0x800
+; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
 ; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    fneg v24.2d, v24.2d
+; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
 ; CHECK-NEON-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEON-NEXT:    and v26.16b, v1.16b, v26.16b
+; CHECK-NEON-NEXT:    xtn v17.2s, v17.2d
 ; CHECK-NEON-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEON-NEXT:    and v4.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    dup v5.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #2048 // =0x800
-; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    xtn v16.2s, v17.2d
-; CHECK-NEON-NEXT:    dup v17.2d, x8
+; CHECK-NEON-NEXT:    xtn v4.2s, v16.2d
+; CHECK-NEON-NEXT:    dup v16.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #4096 // =0x1000
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1024 // =0x400
-; CHECK-NEON-NEXT:    xtn v4.2s, v4.2d
-; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    dup v3.2d, x8
+; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v18.16b
+; CHECK-NEON-NEXT:    dup v18.2d, x9
+; CHECK-NEON-NEXT:    dup v19.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #8192 // =0x2000
-; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
+; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
+; CHECK-NEON-NEXT:    xtn v3.2s, v5.2d
+; CHECK-NEON-NEXT:    eor v5.16b, v6.16b, v7.16b
+; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v16.16b
+; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v19.16b
 ; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
 ; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #65536 // =0x10000
-; CHECK-NEON-NEXT:    xtn v5.2s, v5.2d
+; CHECK-NEON-NEXT:    mov w8, #1024 // =0x400
 ; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
-; CHECK-NEON-NEXT:    xtn v17.2s, v17.2d
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v6.16b
-; CHECK-NEON-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEON-NEXT:    xtn v6.2s, v7.2d
-; CHECK-NEON-NEXT:    dup v7.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #131072 // =0x20000
 ; CHECK-NEON-NEXT:    dup v19.2d, x8
+; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
 ; CHECK-NEON-NEXT:    mov w8, #16384 // =0x4000
+; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
 ; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT:    eor v4.16b, v16.16b, v4.16b
-; CHECK-NEON-NEXT:    xtn v3.2s, v3.2d
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    dup v17.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #262144 // =0x40000
-; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
+; CHECK-NEON-NEXT:    dup v20.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #65536 // =0x10000
+; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
+; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
+; CHECK-NEON-NEXT:    dup v21.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #131072 // =0x20000
 ; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
-; CHECK-NEON-NEXT:    eor v4.16b, v4.16b, v5.16b
-; CHECK-NEON-NEXT:    dup v5.2d, x8
+; CHECK-NEON-NEXT:    eor v4.16b, v5.16b, v4.16b
+; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v6.2s
+; CHECK-NEON-NEXT:    dup v6.2d, x8
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
 ; CHECK-NEON-NEXT:    mov w8, #32768 // =0x8000
-; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT:    xtn v19.2s, v19.2d
-; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
-; CHECK-NEON-NEXT:    eor v6.16b, v16.16b, v6.16b
-; CHECK-NEON-NEXT:    dup v16.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #4194304 // =0x400000
-; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEON-NEXT:    dup v20.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #8388608 // =0x800000
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
+; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
+; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
+; CHECK-NEON-NEXT:    eor v3.16b, v17.16b, v3.16b
+; CHECK-NEON-NEXT:    xtn v17.2s, v19.2d
+; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v21.16b
+; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v6.16b
 ; CHECK-NEON-NEXT:    dup v21.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #16777216 // =0x1000000
+; CHECK-NEON-NEXT:    mov w8, #262144 // =0x40000
 ; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    xtn v17.2s, v17.2d
-; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    xtn v5.2s, v5.2d
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEON-NEXT:    dup v4.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #524288 // =0x80000
-; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT:    and v21.16b, v1.16b, v21.16b
+; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v4.16b
+; CHECK-NEON-NEXT:    xtn v20.2s, v20.2d
+; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v16.16b
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v18.2s
+; CHECK-NEON-NEXT:    dup v18.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #4194304 // =0x400000
+; CHECK-NEON-NEXT:    xtn v19.2s, v19.2d
+; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
 ; CHECK-NEON-NEXT:    dup v22.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1048576 // =0x100000
-; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v18.16b
+; CHECK-NEON-NEXT:    mov w8, #8388608 // =0x800000
 ; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT:    eor v7.16b, v7.16b, v19.16b
-; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEON-NEXT:    xtn v19.2s, v20.2d
-; CHECK-NEON-NEXT:    xtn v20.2s, v21.2d
-; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v22.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    movi v22.4s, #128, lsl #24
-; CHECK-NEON-NEXT:    xtn v21.2s, v4.2d
-; CHECK-NEON-NEXT:    eor v3.16b, v6.16b, v17.16b
-; CHECK-NEON-NEXT:    dup v17.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #2097152 // =0x200000
-; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    eor v5.16b, v7.16b, v5.16b
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT:    xtn v6.2s, v18.2d
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #33554432 // =0x2000000
-; CHECK-NEON-NEXT:    fneg v22.2d, v22.2d
+; CHECK-NEON-NEXT:    dup v23.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #524288 // =0x80000
+; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
+; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v18.16b
+; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v7.16b
+; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v16.16b
+; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v21.16b
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v19.2s
+; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
+; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v22.16b
+; CHECK-NEON-NEXT:    and v21.16b, v1.16b, v23.16b
+; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
+; CHECK-NEON-NEXT:    eor v4.16b, v3.16b, v17.16b
+; CHECK-NEON-NEXT:    movi v23.4s, #128, lsl #24
+; CHECK-NEON-NEXT:    eor v3.16b, v5.16b, v20.16b
+; CHECK-NEON-NEXT:    xtn v5.2s, v7.2d
+; CHECK-NEON-NEXT:    dup v7.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #16777216 // =0x1000000
+; CHECK-NEON-NEXT:    xtn v17.2s, v19.2d
+; CHECK-NEON-NEXT:    xtn v19.2s, v21.2d
 ; CHECK-NEON-NEXT:    dup v20.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #67108864 // =0x4000000
-; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v21.2s
+; CHECK-NEON-NEXT:    mov w8, #33554432 // =0x2000000
+; CHECK-NEON-NEXT:    eor v6.16b, v16.16b, v6.16b
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v18.2s
+; CHECK-NEON-NEXT:    dup v18.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #1048576 // =0x100000
 ; CHECK-NEON-NEXT:    dup v21.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #536870912 // =0x20000000
-; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEON-NEXT:    eor v7.16b, v7.16b, v16.16b
-; CHECK-NEON-NEXT:    dup v16.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1073741824 // =0x40000000
+; CHECK-NEON-NEXT:    mov w8, #2097152 // =0x200000
+; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
 ; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT:    dup v23.2d, x8
+; CHECK-NEON-NEXT:    dup v22.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #67108864 // =0x4000000
+; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
+; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
+; CHECK-NEON-NEXT:    fneg v23.2d, v23.2d
+; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v16.16b
+; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v18.16b
+; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
+; CHECK-NEON-NEXT:    xtn v18.2s, v20.2d
+; CHECK-NEON-NEXT:    dup v20.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #134217728 // =0x8000000
 ; CHECK-NEON-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT:    xtn v17.2s, v17.2d
-; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    eor v7.16b, v7.16b, v19.16b
+; CHECK-NEON-NEXT:    and v22.16b, v1.16b, v22.16b
+; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
+; CHECK-NEON-NEXT:    eor v17.16b, v17.16b, v19.16b
+; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
 ; CHECK-NEON-NEXT:    dup v19.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #268435456 // =0x10000000
-; CHECK-NEON-NEXT:    xtn v20.2s, v20.2d
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT:    and v23.16b, v1.16b, v23.16b
-; CHECK-NEON-NEXT:    xtn v21.2s, v21.2d
+; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
+; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
+; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
+; CHECK-NEON-NEXT:    dup v24.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #536870912 // =0x20000000
 ; CHECK-NEON-NEXT:    dup v25.2d, x8
-; CHECK-NEON-NEXT:    and v22.16b, v1.16b, v22.16b
+; CHECK-NEON-NEXT:    mov w8, #1073741824 // =0x40000000
 ; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
-; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT:    xtn v23.2s, v23.2d
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT:    and v25.16b, v1.16b, v25.16b
-; CHECK-NEON-NEXT:    xtn v22.2s, v22.2d
+; CHECK-NEON-NEXT:    xtn v21.2s, v21.2d
+; CHECK-NEON-NEXT:    xtn v20.2s, v20.2d
+; CHECK-NEON-NEXT:    dup v26.2d, x8
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
+; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
+; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v4.16b
+; CHECK-NEON-NEXT:    eor v17.16b, v17.16b, v18.16b
+; CHECK-NEON-NEXT:    xtn v18.2s, v22.2d
 ; CHECK-NEON-NEXT:    xtn v19.2s, v19.2d
+; CHECK-NEON-NEXT:    and v22.16b, v1.16b, v24.16b
+; CHECK-NEON-NEXT:    and v24.16b, v1.16b, v25.16b
+; CHECK-NEON-NEXT:    and v25.16b, v1.16b, v26.16b
 ; CHECK-NEON-NEXT:    umull v21.2d, v0.2s, v21.2s
-; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v4.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v5.16b, v6.16b
-; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v24.16b
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    umull v23.2d, v0.2s, v23.2s
-; CHECK-NEON-NEXT:    xtn v6.2s, v25.2d
-; CHECK-NEON-NEXT:    eor v5.16b, v7.16b, v20.16b
-; CHECK-NEON-NEXT:    xtn v7.2s, v26.2d
-; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v22.2s
-; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    eor v4.16b, v4.16b, v17.16b
+; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
+; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v23.16b
+; CHECK-NEON-NEXT:    eor v7.16b, v17.16b, v16.16b
+; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v5.16b
+; CHECK-NEON-NEXT:    xtn v16.2s, v22.2d
+; CHECK-NEON-NEXT:    xtn v17.2s, v24.2d
+; CHECK-NEON-NEXT:    xtn v22.2s, v25.2d
+; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v18.2s
+; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v19.2s
 ; CHECK-NEON-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEON-NEXT:    eor v5.16b, v6.16b, v21.16b
+; CHECK-NEON-NEXT:    eor v6.16b, v7.16b, v20.16b
 ; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v21.16b
-; CHECK-NEON-NEXT:    eor v16.16b, v16.16b, v23.16b
-; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    eor v4.16b, v4.16b, v18.16b
+; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v16.2s
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v17.2s
+; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v22.2s
+; CHECK-NEON-NEXT:    eor v3.16b, v5.16b, v4.16b
+; CHECK-NEON-NEXT:    eor v4.16b, v6.16b, v18.16b
 ; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v19.16b
-; CHECK-NEON-NEXT:    eor v7.16b, v16.16b, v20.16b
-; CHECK-NEON-NEXT:    eor v1.16b, v2.16b, v4.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v5.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v7.16b, v6.16b
+; CHECK-NEON-NEXT:    eor v1.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT:    eor v2.16b, v4.16b, v7.16b
+; CHECK-NEON-NEXT:    eor v3.16b, v16.16b, v17.16b
 ; CHECK-NEON-NEXT:    eor v1.16b, v1.16b, v2.16b
 ; CHECK-NEON-NEXT:    eor v0.16b, v3.16b, v0.16b
 ; CHECK-NEON-NEXT:    eor v0.16b, v1.16b, v0.16b
@@ -2357,13 +2334,13 @@ define <2 x i64> @clmul_v2i64_neon_zext(<2 x i32> %x, <2 x i32> %y) {
 define <4 x i64> @clmul_v4i64_neon_zext(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-NEON-LABEL: clmul_v4i64_neon_zext:
 ; CHECK-NEON:       // %bb.0:
-; CHECK-NEON-NEXT:    sub sp, sp, #320
-; CHECK-NEON-NEXT:    stp d15, d14, [sp, #240] // 16-byte Folded Spill
-; CHECK-NEON-NEXT:    stp d13, d12, [sp, #256] // 16-byte Folded Spill
-; CHECK-NEON-NEXT:    stp d11, d10, [sp, #272] // 16-byte Folded Spill
-; CHECK-NEON-NEXT:    stp d9, d8, [sp, #288] // 16-byte Folded Spill
-; CHECK-NEON-NEXT:    str x29, [sp, #304] // 8-byte Spill
-; CHECK-NEON-NEXT:    .cfi_def_cfa_offset 320
+; CHECK-NEON-NEXT:    sub sp, sp, #288
+; CHECK-NEON-NEXT:    stp d15, d14, [sp, #208] // 16-byte Folded Spill
+; CHECK-NEON-NEXT:    stp d13, d12, [sp, #224] // 16-byte Folded Spill
+; CHECK-NEON-NEXT:    stp d11, d10, [sp, #240] // 16-byte Folded Spill
+; CHECK-NEON-NEXT:    stp d9, d8, [sp, #256] // 16-byte Folded Spill
+; CHECK-NEON-NEXT:    str x29, [sp, #272] // 8-byte Spill
+; CHECK-NEON-NEXT:    .cfi_def_cfa_offset 288
 ; CHECK-NEON-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEON-NEXT:    .cfi_offset b8, -24
 ; CHECK-NEON-NEXT:    .cfi_offset b9, -32
@@ -2373,406 +2350,355 @@ define <4 x i64> @clmul_v4i64_neon_zext(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-NEON-NEXT:    .cfi_offset b13, -64
 ; CHECK-NEON-NEXT:    .cfi_offset b14, -72
 ; CHECK-NEON-NEXT:    .cfi_offset b15, -80
+; CHECK-NEON-NEXT:    mov v27.16b, v1.16b
 ; CHECK-NEON-NEXT:    mov w8, #2 // =0x2
-; CHECK-NEON-NEXT:    mov w9, #1 // =0x1
-; CHECK-NEON-NEXT:    ushll v6.2d, v1.2s, #0
-; CHECK-NEON-NEXT:    dup v23.2d, x8
-; CHECK-NEON-NEXT:    dup v22.2d, x9
-; CHECK-NEON-NEXT:    mov w8, #4 // =0x4
-; CHECK-NEON-NEXT:    mov w10, #16 // =0x10
-; CHECK-NEON-NEXT:    dup v28.2d, x8
-; CHECK-NEON-NEXT:    mov w9, #8 // =0x8
-; CHECK-NEON-NEXT:    dup v24.2d, x10
-; CHECK-NEON-NEXT:    ushll2 v5.2d, v1.4s, #0
-; CHECK-NEON-NEXT:    dup v25.2d, x9
-; CHECK-NEON-NEXT:    and v3.16b, v6.16b, v23.16b
-; CHECK-NEON-NEXT:    and v4.16b, v6.16b, v22.16b
-; CHECK-NEON-NEXT:    mov w8, #32 // =0x20
-; CHECK-NEON-NEXT:    and v7.16b, v6.16b, v28.16b
-; CHECK-NEON-NEXT:    dup v26.2d, x8
+; CHECK-NEON-NEXT:    mov w9, #4 // =0x4
+; CHECK-NEON-NEXT:    dup v1.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEON-NEXT:    dup v6.2d, x9
+; CHECK-NEON-NEXT:    dup v4.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #8 // =0x8
+; CHECK-NEON-NEXT:    mov w9, #32 // =0x20
+; CHECK-NEON-NEXT:    ushll v5.2d, v27.2s, #0
+; CHECK-NEON-NEXT:    dup v2.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #16 // =0x10
+; CHECK-NEON-NEXT:    ldr x29, [sp, #272] // 8-byte Reload
+; CHECK-NEON-NEXT:    stp q4, q6, [sp, #160] // 32-byte Folded Spill
+; CHECK-NEON-NEXT:    and v3.16b, v5.16b, v1.16b
+; CHECK-NEON-NEXT:    and v4.16b, v5.16b, v4.16b
+; CHECK-NEON-NEXT:    and v17.16b, v5.16b, v6.16b
+; CHECK-NEON-NEXT:    and v19.16b, v5.16b, v2.16b
+; CHECK-NEON-NEXT:    dup v6.2d, x8
+; CHECK-NEON-NEXT:    str q2, [sp, #192] // 16-byte Spill
+; CHECK-NEON-NEXT:    dup v2.2d, x9
 ; CHECK-NEON-NEXT:    mov w8, #64 // =0x40
-; CHECK-NEON-NEXT:    and v17.16b, v6.16b, v24.16b
-; CHECK-NEON-NEXT:    and v16.16b, v6.16b, v25.16b
-; CHECK-NEON-NEXT:    and v25.16b, v5.16b, v25.16b
+; CHECK-NEON-NEXT:    mov w9, #128 // =0x80
 ; CHECK-NEON-NEXT:    xtn v3.2s, v3.2d
 ; CHECK-NEON-NEXT:    xtn v4.2s, v4.2d
-; CHECK-NEON-NEXT:    mov w9, #256 // =0x100
-; CHECK-NEON-NEXT:    xtn v19.2s, v7.2d
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #128 // =0x80
-; CHECK-NEON-NEXT:    xtn v29.2s, v17.2d
-; CHECK-NEON-NEXT:    and v27.16b, v6.16b, v26.16b
-; CHECK-NEON-NEXT:    ldr x29, [sp, #304] // 8-byte Reload
-; CHECK-NEON-NEXT:    and v24.16b, v5.16b, v24.16b
-; CHECK-NEON-NEXT:    and v26.16b, v5.16b, v26.16b
-; CHECK-NEON-NEXT:    uzp1 v25.4s, v0.4s, v25.4s
-; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
-; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
+; CHECK-NEON-NEXT:    xtn v20.2s, v17.2d
+; CHECK-NEON-NEXT:    xtn v21.2s, v19.2d
+; CHECK-NEON-NEXT:    and v22.16b, v5.16b, v6.16b
+; CHECK-NEON-NEXT:    stp q6, q1, [sp, #112] // 32-byte Folded Spill
+; CHECK-NEON-NEXT:    and v23.16b, v5.16b, v2.16b
 ; CHECK-NEON-NEXT:    dup v1.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #512 // =0x200
-; CHECK-NEON-NEXT:    dup v8.2d, x9
+; CHECK-NEON-NEXT:    dup v7.2d, x9
+; CHECK-NEON-NEXT:    mov w8, #256 // =0x100
 ; CHECK-NEON-NEXT:    mov w9, #2048 // =0x800
-; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v29.2s
-; CHECK-NEON-NEXT:    uzp1 v29.4s, v0.4s, v24.4s
-; CHECK-NEON-NEXT:    uzp1 v26.4s, v0.4s, v26.4s
-; CHECK-NEON-NEXT:    and v30.16b, v5.16b, v18.16b
+; CHECK-NEON-NEXT:    str q2, [sp, #144] // 16-byte Spill
+; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
+; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
+; CHECK-NEON-NEXT:    umull v24.2d, v0.2s, v20.2s
+; CHECK-NEON-NEXT:    umull v21.2d, v0.2s, v21.2s
+; CHECK-NEON-NEXT:    xtn v25.2s, v22.2d
 ; CHECK-NEON-NEXT:    dup v2.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1024 // =0x400
-; CHECK-NEON-NEXT:    stp q8, q1, [sp, #192] // 32-byte Folded Spill
+; CHECK-NEON-NEXT:    xtn v23.2s, v23.2d
+; CHECK-NEON-NEXT:    dup v6.2d, x9
+; CHECK-NEON-NEXT:    and v26.16b, v5.16b, v1.16b
+; CHECK-NEON-NEXT:    mov w9, #4096 // =0x1000
+; CHECK-NEON-NEXT:    str q1, [sp, #64] // 16-byte Spill
+; CHECK-NEON-NEXT:    mov w8, #512 // =0x200
+; CHECK-NEON-NEXT:    dup v1.2d, x9
+; CHECK-NEON-NEXT:    and v28.16b, v5.16b, v7.16b
+; CHECK-NEON-NEXT:    and v29.16b, v5.16b, v2.16b
 ; CHECK-NEON-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEON-NEXT:    and v4.16b, v5.16b, v22.16b
-; CHECK-NEON-NEXT:    and v22.16b, v5.16b, v28.16b
-; CHECK-NEON-NEXT:    dup v7.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #4096 // =0x1000
-; CHECK-NEON-NEXT:    and v31.16b, v5.16b, v1.16b
-; CHECK-NEON-NEXT:    and v8.16b, v5.16b, v8.16b
-; CHECK-NEON-NEXT:    str q3, [sp, #224] // 16-byte Spill
-; CHECK-NEON-NEXT:    and v3.16b, v5.16b, v23.16b
-; CHECK-NEON-NEXT:    umull2 v9.2d, v0.4s, v25.4s
-; CHECK-NEON-NEXT:    str q7, [sp, #96] // 16-byte Spill
-; CHECK-NEON-NEXT:    uzp1 v4.4s, v0.4s, v4.4s
-; CHECK-NEON-NEXT:    uzp1 v28.4s, v0.4s, v22.4s
-; CHECK-NEON-NEXT:    xtn v21.2s, v16.2d
-; CHECK-NEON-NEXT:    dup v16.2d, x9
-; CHECK-NEON-NEXT:    mov w9, #32768 // =0x8000
-; CHECK-NEON-NEXT:    uzp1 v3.4s, v0.4s, v3.4s
-; CHECK-NEON-NEXT:    dup v17.2d, x8
+; CHECK-NEON-NEXT:    eor v4.16b, v24.16b, v21.16b
+; CHECK-NEON-NEXT:    umull v21.2d, v0.2s, v25.2s
+; CHECK-NEON-NEXT:    and v25.16b, v5.16b, v6.16b
+; CHECK-NEON-NEXT:    dup v24.2d, x8
+; CHECK-NEON-NEXT:    umull v23.2d, v0.2s, v23.2s
+; CHECK-NEON-NEXT:    xtn v26.2s, v26.2d
+; CHECK-NEON-NEXT:    mov w8, #1024 // =0x400
+; CHECK-NEON-NEXT:    stp q1, q6, [sp, #80] // 32-byte Folded Spill
+; CHECK-NEON-NEXT:    and v8.16b, v5.16b, v1.16b
+; CHECK-NEON-NEXT:    dup v1.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #8192 // =0x2000
-; CHECK-NEON-NEXT:    uzp1 v30.4s, v0.4s, v30.4s
-; CHECK-NEON-NEXT:    umull2 v29.2d, v0.4s, v29.4s
-; CHECK-NEON-NEXT:    umull2 v26.2d, v0.4s, v26.4s
-; CHECK-NEON-NEXT:    stp q16, q2, [sp, #160] // 32-byte Folded Spill
-; CHECK-NEON-NEXT:    umull2 v4.2d, v0.4s, v4.4s
-; CHECK-NEON-NEXT:    umull2 v28.2d, v0.4s, v28.4s
-; CHECK-NEON-NEXT:    uzp1 v31.4s, v0.4s, v31.4s
-; CHECK-NEON-NEXT:    uzp1 v8.4s, v0.4s, v8.4s
-; CHECK-NEON-NEXT:    dup v23.2d, x8
+; CHECK-NEON-NEXT:    xtn v28.2s, v28.2d
+; CHECK-NEON-NEXT:    xtn v29.2s, v29.2d
+; CHECK-NEON-NEXT:    xtn v9.2s, v25.2d
+; CHECK-NEON-NEXT:    dup v25.2d, x8
+; CHECK-NEON-NEXT:    and v31.16b, v5.16b, v24.16b
+; CHECK-NEON-NEXT:    eor v30.16b, v3.16b, v4.16b
+; CHECK-NEON-NEXT:    eor v3.16b, v21.16b, v23.16b
+; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v26.2s
+; CHECK-NEON-NEXT:    xtn v23.2s, v8.2d
 ; CHECK-NEON-NEXT:    mov w8, #16384 // =0x4000
-; CHECK-NEON-NEXT:    umull2 v3.2d, v0.4s, v3.4s
-; CHECK-NEON-NEXT:    and v10.16b, v5.16b, v17.16b
-; CHECK-NEON-NEXT:    umull2 v30.2d, v0.4s, v30.4s
-; CHECK-NEON-NEXT:    eor v29.16b, v29.16b, v26.16b
-; CHECK-NEON-NEXT:    dup v24.2d, x8
+; CHECK-NEON-NEXT:    and v11.16b, v5.16b, v1.16b
+; CHECK-NEON-NEXT:    str q1, [sp, #48] // 16-byte Spill
+; CHECK-NEON-NEXT:    and v12.16b, v5.16b, v25.16b
+; CHECK-NEON-NEXT:    umull v21.2d, v0.2s, v28.2s
+; CHECK-NEON-NEXT:    umull v8.2d, v0.2s, v29.2s
+; CHECK-NEON-NEXT:    xtn v10.2s, v31.2d
+; CHECK-NEON-NEXT:    dup v26.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #65536 // =0x10000
-; CHECK-NEON-NEXT:    stp q17, q23, [sp, #128] // 32-byte Folded Spill
-; CHECK-NEON-NEXT:    eor v9.16b, v28.16b, v9.16b
-; CHECK-NEON-NEXT:    and v28.16b, v5.16b, v16.16b
-; CHECK-NEON-NEXT:    umull2 v31.2d, v0.4s, v31.4s
-; CHECK-NEON-NEXT:    umull2 v8.2d, v0.4s, v8.4s
-; CHECK-NEON-NEXT:    str q24, [sp, #112] // 16-byte Spill
-; CHECK-NEON-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEON-NEXT:    and v4.16b, v5.16b, v2.16b
-; CHECK-NEON-NEXT:    uzp1 v10.4s, v0.4s, v10.4s
-; CHECK-NEON-NEXT:    and v13.16b, v5.16b, v23.16b
+; CHECK-NEON-NEXT:    umull v9.2d, v0.2s, v9.2s
+; CHECK-NEON-NEXT:    umull v23.2d, v0.2s, v23.2s
+; CHECK-NEON-NEXT:    mov w9, #131072 // =0x20000
+; CHECK-NEON-NEXT:    eor v31.16b, v3.16b, v4.16b
+; CHECK-NEON-NEXT:    xtn v4.2s, v12.2d
+; CHECK-NEON-NEXT:    stp q7, q2, [sp, #16] // 32-byte Folded Spill
+; CHECK-NEON-NEXT:    dup v29.2d, x8
+; CHECK-NEON-NEXT:    eor v3.16b, v21.16b, v8.16b
+; CHECK-NEON-NEXT:    mov w8, #32768 // =0x8000
+; CHECK-NEON-NEXT:    umull v8.2d, v0.2s, v10.2s
+; CHECK-NEON-NEXT:    and v10.16b, v5.16b, v26.16b
 ; CHECK-NEON-NEXT:    dup v1.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #131072 // =0x20000
-; CHECK-NEON-NEXT:    uzp1 v12.4s, v0.4s, v28.4s
-; CHECK-NEON-NEXT:    and v11.16b, v5.16b, v7.16b
-; CHECK-NEON-NEXT:    uzp1 v4.4s, v0.4s, v4.4s
-; CHECK-NEON-NEXT:    dup v17.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #524288 // =0x80000
-; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v9.16b
-; CHECK-NEON-NEXT:    eor v29.16b, v29.16b, v30.16b
-; CHECK-NEON-NEXT:    uzp1 v9.4s, v0.4s, v13.4s
-; CHECK-NEON-NEXT:    dup v22.2d, x9
-; CHECK-NEON-NEXT:    mov w9, #262144 // =0x40000
-; CHECK-NEON-NEXT:    eor v31.16b, v31.16b, v8.16b
-; CHECK-NEON-NEXT:    stp q17, q1, [sp, #64] // 32-byte Folded Spill
-; CHECK-NEON-NEXT:    umull2 v8.2d, v0.4s, v12.4s
-; CHECK-NEON-NEXT:    umull2 v10.2d, v0.4s, v10.4s
-; CHECK-NEON-NEXT:    umull2 v4.2d, v0.4s, v4.4s
-; CHECK-NEON-NEXT:    dup v2.2d, x9
-; CHECK-NEON-NEXT:    mov w9, #536870912 // =0x20000000
-; CHECK-NEON-NEXT:    str q22, [sp, #16] // 16-byte Spill
-; CHECK-NEON-NEXT:    uzp1 v30.4s, v0.4s, v11.4s
-; CHECK-NEON-NEXT:    eor v7.16b, v3.16b, v29.16b
-; CHECK-NEON-NEXT:    and v29.16b, v5.16b, v24.16b
-; CHECK-NEON-NEXT:    and v11.16b, v5.16b, v1.16b
-; CHECK-NEON-NEXT:    and v12.16b, v5.16b, v17.16b
-; CHECK-NEON-NEXT:    dup v16.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #262144 // =0x40000
+; CHECK-NEON-NEXT:    xtn v11.2s, v11.2d
+; CHECK-NEON-NEXT:    eor v9.16b, v9.16b, v23.16b
+; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
+; CHECK-NEON-NEXT:    dup v23.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #4194304 // =0x400000
-; CHECK-NEON-NEXT:    eor v8.16b, v8.16b, v10.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v31.16b, v4.16b
-; CHECK-NEON-NEXT:    umull2 v31.2d, v0.4s, v9.4s
-; CHECK-NEON-NEXT:    and v9.16b, v5.16b, v22.16b
-; CHECK-NEON-NEXT:    uzp1 v10.4s, v0.4s, v29.4s
-; CHECK-NEON-NEXT:    uzp1 v11.4s, v0.4s, v11.4s
-; CHECK-NEON-NEXT:    uzp1 v12.4s, v0.4s, v12.4s
-; CHECK-NEON-NEXT:    stp q16, q2, [sp, #32] // 32-byte Folded Spill
-; CHECK-NEON-NEXT:    and v13.16b, v5.16b, v2.16b
-; CHECK-NEON-NEXT:    xtn v14.2s, v27.2d
-; CHECK-NEON-NEXT:    umull2 v30.2d, v0.4s, v30.4s
-; CHECK-NEON-NEXT:    dup v1.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #8388608 // =0x800000
-; CHECK-NEON-NEXT:    umull v21.2d, v0.2s, v21.2s
-; CHECK-NEON-NEXT:    uzp1 v9.4s, v0.4s, v9.4s
-; CHECK-NEON-NEXT:    eor v8.16b, v8.16b, v31.16b
-; CHECK-NEON-NEXT:    and v31.16b, v5.16b, v16.16b
-; CHECK-NEON-NEXT:    uzp1 v13.4s, v0.4s, v13.4s
-; CHECK-NEON-NEXT:    dup v29.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #16777216 // =0x1000000
+; CHECK-NEON-NEXT:    xtn v14.2s, v10.2d
 ; CHECK-NEON-NEXT:    str q1, [sp] // 16-byte Spill
-; CHECK-NEON-NEXT:    umull2 v10.2d, v0.4s, v10.4s
-; CHECK-NEON-NEXT:    umull2 v11.2d, v0.4s, v11.4s
-; CHECK-NEON-NEXT:    umull2 v12.2d, v0.4s, v12.4s
-; CHECK-NEON-NEXT:    eor v2.16b, v4.16b, v30.16b
-; CHECK-NEON-NEXT:    umull v15.2d, v0.2s, v14.2s
-; CHECK-NEON-NEXT:    uzp1 v4.4s, v0.4s, v31.4s
-; CHECK-NEON-NEXT:    dup v31.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1048576 // =0x100000
-; CHECK-NEON-NEXT:    and v3.16b, v5.16b, v1.16b
-; CHECK-NEON-NEXT:    and v1.16b, v5.16b, v29.16b
-; CHECK-NEON-NEXT:    umull2 v9.2d, v0.4s, v9.4s
-; CHECK-NEON-NEXT:    eor v14.16b, v20.16b, v21.16b
-; CHECK-NEON-NEXT:    umull2 v21.2d, v0.4s, v13.4s
-; CHECK-NEON-NEXT:    eor v20.16b, v8.16b, v10.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v7.16b, v2.16b
-; CHECK-NEON-NEXT:    eor v7.16b, v11.16b, v12.16b
-; CHECK-NEON-NEXT:    eor v15.16b, v19.16b, v15.16b
-; CHECK-NEON-NEXT:    uzp1 v3.4s, v0.4s, v3.4s
-; CHECK-NEON-NEXT:    uzp1 v1.4s, v0.4s, v1.4s
-; CHECK-NEON-NEXT:    and v19.16b, v5.16b, v31.16b
-; CHECK-NEON-NEXT:    umull2 v8.2d, v0.4s, v4.4s
 ; CHECK-NEON-NEXT:    dup v10.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #33554432 // =0x2000000
-; CHECK-NEON-NEXT:    eor v20.16b, v20.16b, v9.16b
-; CHECK-NEON-NEXT:    eor v7.16b, v7.16b, v21.16b
-; CHECK-NEON-NEXT:    dup v9.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #67108864 // =0x4000000
 ; CHECK-NEON-NEXT:    dup v28.2d, x9
-; CHECK-NEON-NEXT:    uzp1 v19.4s, v0.4s, v19.4s
-; CHECK-NEON-NEXT:    umull2 v11.2d, v0.4s, v3.4s
-; CHECK-NEON-NEXT:    umull2 v1.2d, v0.4s, v1.4s
-; CHECK-NEON-NEXT:    eor v25.16b, v2.16b, v20.16b
-; CHECK-NEON-NEXT:    movi v2.4s, #128, lsl #24
-; CHECK-NEON-NEXT:    eor v23.16b, v7.16b, v8.16b
-; CHECK-NEON-NEXT:    dup v8.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1073741824 // =0x40000000
-; CHECK-NEON-NEXT:    and v12.16b, v5.16b, v9.16b
-; CHECK-NEON-NEXT:    dup v27.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #2097152 // =0x200000
-; CHECK-NEON-NEXT:    and v3.16b, v5.16b, v28.16b
-; CHECK-NEON-NEXT:    umull2 v7.2d, v0.4s, v19.4s
-; CHECK-NEON-NEXT:    eor v4.16b, v11.16b, v1.16b
+; CHECK-NEON-NEXT:    mov w9, #8388608 // =0x800000
+; CHECK-NEON-NEXT:    mov w8, #524288 // =0x80000
+; CHECK-NEON-NEXT:    and v15.16b, v5.16b, v1.16b
+; CHECK-NEON-NEXT:    eor v2.16b, v3.16b, v8.16b
+; CHECK-NEON-NEXT:    and v1.16b, v5.16b, v23.16b
+; CHECK-NEON-NEXT:    eor v4.16b, v9.16b, v4.16b
+; CHECK-NEON-NEXT:    umull v9.2d, v0.2s, v11.2s
 ; CHECK-NEON-NEXT:    and v11.16b, v5.16b, v10.16b
-; CHECK-NEON-NEXT:    fneg v30.2d, v2.2d
-; CHECK-NEON-NEXT:    dup v13.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #134217728 // =0x8000000
-; CHECK-NEON-NEXT:    and v1.16b, v5.16b, v27.16b
-; CHECK-NEON-NEXT:    and v2.16b, v5.16b, v8.16b
-; CHECK-NEON-NEXT:    dup v24.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #268435456 // =0x10000000
-; CHECK-NEON-NEXT:    uzp1 v12.4s, v0.4s, v12.4s
-; CHECK-NEON-NEXT:    uzp1 v17.4s, v0.4s, v3.4s
-; CHECK-NEON-NEXT:    eor v19.16b, v4.16b, v7.16b
-; CHECK-NEON-NEXT:    uzp1 v7.4s, v0.4s, v11.4s
-; CHECK-NEON-NEXT:    and v3.16b, v5.16b, v13.16b
-; CHECK-NEON-NEXT:    uzp1 v16.4s, v0.4s, v1.4s
-; CHECK-NEON-NEXT:    uzp1 v11.4s, v0.4s, v2.4s
-; CHECK-NEON-NEXT:    and v1.16b, v5.16b, v30.16b
-; CHECK-NEON-NEXT:    and v2.16b, v5.16b, v24.16b
-; CHECK-NEON-NEXT:    movi v22.2d, #0000000000000000
-; CHECK-NEON-NEXT:    and v4.16b, v6.16b, v18.16b
-; CHECK-NEON-NEXT:    umull2 v12.2d, v0.4s, v12.4s
-; CHECK-NEON-NEXT:    umull2 v18.2d, v0.4s, v17.4s
-; CHECK-NEON-NEXT:    dup v26.2d, x8
-; CHECK-NEON-NEXT:    umull2 v20.2d, v0.4s, v7.4s
-; CHECK-NEON-NEXT:    uzp1 v7.4s, v0.4s, v3.4s
-; CHECK-NEON-NEXT:    uzp1 v3.4s, v0.4s, v1.4s
-; CHECK-NEON-NEXT:    umull2 v17.2d, v0.4s, v16.4s
-; CHECK-NEON-NEXT:    uzp1 v16.4s, v0.4s, v2.4s
-; CHECK-NEON-NEXT:    umull2 v11.2d, v0.4s, v11.4s
-; CHECK-NEON-NEXT:    ldp q1, q2, [sp, #192] // 32-byte Folded Reload
-; CHECK-NEON-NEXT:    xtn v21.2s, v4.2d
-; CHECK-NEON-NEXT:    eor v19.16b, v19.16b, v12.16b
-; CHECK-NEON-NEXT:    umull2 v7.2d, v0.4s, v7.4s
-; CHECK-NEON-NEXT:    umull2 v4.2d, v0.4s, v3.4s
-; CHECK-NEON-NEXT:    and v3.16b, v5.16b, v26.16b
-; CHECK-NEON-NEXT:    and v12.16b, v6.16b, v2.16b
-; CHECK-NEON-NEXT:    uzp1 v2.4s, v0.4s, v22.4s
-; CHECK-NEON-NEXT:    ldr q22, [sp, #176] // 16-byte Reload
-; CHECK-NEON-NEXT:    and v1.16b, v6.16b, v1.16b
-; CHECK-NEON-NEXT:    eor v20.16b, v23.16b, v20.16b
-; CHECK-NEON-NEXT:    eor v17.16b, v18.16b, v17.16b
-; CHECK-NEON-NEXT:    umull2 v16.2d, v0.4s, v16.4s
-; CHECK-NEON-NEXT:    eor v19.16b, v19.16b, v11.16b
-; CHECK-NEON-NEXT:    and v22.16b, v6.16b, v22.16b
-; CHECK-NEON-NEXT:    xtn v18.2s, v12.2d
-; CHECK-NEON-NEXT:    uzp1 v11.4s, v0.4s, v3.4s
-; CHECK-NEON-NEXT:    and v23.16b, v6.16b, v28.16b
+; CHECK-NEON-NEXT:    dup v8.2d, x9
+; CHECK-NEON-NEXT:    mov w9, #16777216 // =0x1000000
+; CHECK-NEON-NEXT:    and v12.16b, v5.16b, v29.16b
+; CHECK-NEON-NEXT:    and v13.16b, v5.16b, v28.16b
+; CHECK-NEON-NEXT:    eor v18.16b, v30.16b, v31.16b
+; CHECK-NEON-NEXT:    umull v14.2d, v0.2s, v14.2s
+; CHECK-NEON-NEXT:    xtn v30.2s, v11.2d
+; CHECK-NEON-NEXT:    xtn v3.2s, v1.2d
+; CHECK-NEON-NEXT:    dup v31.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #1048576 // =0x100000
+; CHECK-NEON-NEXT:    and v1.16b, v5.16b, v8.16b
+; CHECK-NEON-NEXT:    dup v11.2d, x9
+; CHECK-NEON-NEXT:    mov w9, #33554432 // =0x2000000
+; CHECK-NEON-NEXT:    xtn v12.2s, v12.2d
+; CHECK-NEON-NEXT:    xtn v13.2s, v13.2d
+; CHECK-NEON-NEXT:    xtn v15.2s, v15.2d
+; CHECK-NEON-NEXT:    eor v7.16b, v2.16b, v9.16b
+; CHECK-NEON-NEXT:    eor v17.16b, v4.16b, v14.16b
+; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v30.2s
+; CHECK-NEON-NEXT:    dup v30.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #67108864 // =0x4000000
 ; CHECK-NEON-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEON-NEXT:    umull2 v3.2d, v0.4s, v2.4s
-; CHECK-NEON-NEXT:    eor v2.16b, v20.16b, v7.16b
-; CHECK-NEON-NEXT:    ldp q12, q20, [sp, #144] // 32-byte Folded Reload
-; CHECK-NEON-NEXT:    eor v4.16b, v17.16b, v4.16b
-; CHECK-NEON-NEXT:    ldr q17, [sp, #96] // 16-byte Reload
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v21.2s
-; CHECK-NEON-NEXT:    ldr q21, [sp, #128] // 16-byte Reload
-; CHECK-NEON-NEXT:    eor v16.16b, v19.16b, v16.16b
-; CHECK-NEON-NEXT:    xtn v19.2s, v22.2d
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    and v17.16b, v6.16b, v17.16b
-; CHECK-NEON-NEXT:    and v20.16b, v6.16b, v20.16b
-; CHECK-NEON-NEXT:    and v21.16b, v6.16b, v21.16b
-; CHECK-NEON-NEXT:    umull v1.2d, v0.2s, v1.2s
-; CHECK-NEON-NEXT:    umull2 v22.2d, v0.4s, v11.4s
-; CHECK-NEON-NEXT:    ldr q11, [sp, #224] // 16-byte Reload
-; CHECK-NEON-NEXT:    eor v4.16b, v4.16b, v3.16b
-; CHECK-NEON-NEXT:    and v12.16b, v6.16b, v12.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v25.16b, v2.16b
-; CHECK-NEON-NEXT:    xtn v17.2s, v17.2d
-; CHECK-NEON-NEXT:    xtn v20.2s, v20.2d
-; CHECK-NEON-NEXT:    xtn v21.2s, v21.2d
+; CHECK-NEON-NEXT:    and v9.16b, v5.16b, v31.16b
+; CHECK-NEON-NEXT:    and v14.16b, v5.16b, v11.16b
+; CHECK-NEON-NEXT:    umull v12.2d, v0.2s, v12.2s
+; CHECK-NEON-NEXT:    umull v13.2d, v0.2s, v13.2s
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v3.2s
+; CHECK-NEON-NEXT:    and v2.16b, v5.16b, v30.16b
+; CHECK-NEON-NEXT:    umull v15.2d, v0.2s, v15.2s
+; CHECK-NEON-NEXT:    eor v20.16b, v18.16b, v7.16b
+; CHECK-NEON-NEXT:    xtn v3.2s, v9.2d
+; CHECK-NEON-NEXT:    xtn v14.2s, v14.2d
+; CHECK-NEON-NEXT:    dup v9.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #2097152 // =0x200000
+; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT:    ushll2 v1.2d, v27.4s, #0
+; CHECK-NEON-NEXT:    xtn v19.2s, v2.2d
+; CHECK-NEON-NEXT:    ldr q2, [sp, #128] // 16-byte Reload
+; CHECK-NEON-NEXT:    eor v13.16b, v12.16b, v13.16b
+; CHECK-NEON-NEXT:    dup v12.2d, x9
+; CHECK-NEON-NEXT:    mov w9, #134217728 // =0x8000000
+; CHECK-NEON-NEXT:    eor v18.16b, v17.16b, v15.16b
+; CHECK-NEON-NEXT:    umull v15.2d, v0.2s, v3.2s
+; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v14.2s
+; CHECK-NEON-NEXT:    and v14.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT:    ldp q2, q3, [sp, #160] // 32-byte Folded Reload
+; CHECK-NEON-NEXT:    eor v13.16b, v13.16b, v16.16b
+; CHECK-NEON-NEXT:    and v17.16b, v5.16b, v12.16b
+; CHECK-NEON-NEXT:    eor v16.16b, v6.16b, v4.16b
+; CHECK-NEON-NEXT:    and v6.16b, v5.16b, v9.16b
+; CHECK-NEON-NEXT:    dup v21.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #268435456 // =0x10000000
+; CHECK-NEON-NEXT:    dup v22.2d, x9
+; CHECK-NEON-NEXT:    and v2.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT:    and v3.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT:    uzp1 v14.4s, v0.4s, v14.4s
+; CHECK-NEON-NEXT:    xtn v27.2s, v17.2d
+; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
+; CHECK-NEON-NEXT:    eor v7.16b, v16.16b, v7.16b
+; CHECK-NEON-NEXT:    eor v17.16b, v20.16b, v18.16b
+; CHECK-NEON-NEXT:    and v20.16b, v5.16b, v21.16b
 ; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    eor v11.16b, v11.16b, v14.16b
-; CHECK-NEON-NEXT:    eor v7.16b, v15.16b, v7.16b
-; CHECK-NEON-NEXT:    ldp d15, d14, [sp, #240] // 16-byte Folded Reload
-; CHECK-NEON-NEXT:    eor v1.16b, v18.16b, v1.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v4.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v16.16b, v16.16b, v22.16b
-; CHECK-NEON-NEXT:    ldr q22, [sp, #112] // 16-byte Reload
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v21.2s
-; CHECK-NEON-NEXT:    xtn v21.2s, v12.2d
-; CHECK-NEON-NEXT:    eor v1.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    and v22.16b, v6.16b, v22.16b
-; CHECK-NEON-NEXT:    ldp q25, q19, [sp, #64] // 32-byte Folded Reload
-; CHECK-NEON-NEXT:    eor v4.16b, v4.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v16.16b
-; CHECK-NEON-NEXT:    ldr q16, [sp, #16] // 16-byte Reload
-; CHECK-NEON-NEXT:    eor v7.16b, v11.16b, v7.16b
-; CHECK-NEON-NEXT:    eor v1.16b, v1.16b, v17.16b
-; CHECK-NEON-NEXT:    eor v17.16b, v18.16b, v20.16b
-; CHECK-NEON-NEXT:    ldr q20, [sp, #48] // 16-byte Reload
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v21.2s
-; CHECK-NEON-NEXT:    and v16.16b, v6.16b, v16.16b
-; CHECK-NEON-NEXT:    and v19.16b, v6.16b, v19.16b
-; CHECK-NEON-NEXT:    xtn v22.2s, v22.2d
-; CHECK-NEON-NEXT:    and v25.16b, v6.16b, v25.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v4.16b, v3.16b
-; CHECK-NEON-NEXT:    and v20.16b, v6.16b, v20.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v1.16b, v7.16b, v1.16b
-; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT:    xtn v19.2s, v19.2d
-; CHECK-NEON-NEXT:    xtn v21.2s, v25.2d
-; CHECK-NEON-NEXT:    eor v4.16b, v4.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v17.16b, v17.16b, v18.16b
+; CHECK-NEON-NEXT:    uzp1 v4.4s, v0.4s, v2.4s
+; CHECK-NEON-NEXT:    ldr q2, [sp, #192] // 16-byte Reload
+; CHECK-NEON-NEXT:    uzp1 v3.4s, v0.4s, v3.4s
+; CHECK-NEON-NEXT:    umull2 v14.2d, v0.4s, v14.4s
+; CHECK-NEON-NEXT:    eor v13.16b, v13.16b, v15.16b
+; CHECK-NEON-NEXT:    and v24.16b, v1.16b, v24.16b
+; CHECK-NEON-NEXT:    str q17, [sp, #192] // 16-byte Spill
+; CHECK-NEON-NEXT:    and v2.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v27.2s
+; CHECK-NEON-NEXT:    and v27.16b, v5.16b, v22.16b
+; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v6.2s
 ; CHECK-NEON-NEXT:    xtn v18.2s, v20.2d
-; CHECK-NEON-NEXT:    ldr q20, [sp, #32] // 16-byte Reload
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v22.2s
-; CHECK-NEON-NEXT:    ldr q22, [sp] // 16-byte Reload
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    and v25.16b, v6.16b, v10.16b
-; CHECK-NEON-NEXT:    ldp d11, d10, [sp, #272] // 16-byte Folded Reload
-; CHECK-NEON-NEXT:    and v20.16b, v6.16b, v20.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v4.16b, v3.16b
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    umull v21.2d, v0.2s, v21.2s
-; CHECK-NEON-NEXT:    and v22.16b, v6.16b, v22.16b
-; CHECK-NEON-NEXT:    eor v7.16b, v17.16b, v7.16b
-; CHECK-NEON-NEXT:    xtn v17.2s, v20.2d
-; CHECK-NEON-NEXT:    and v20.16b, v6.16b, v29.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v4.16b, v3.16b
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    and v29.16b, v6.16b, v31.16b
-; CHECK-NEON-NEXT:    xtn v22.2s, v22.2d
+; CHECK-NEON-NEXT:    umull2 v6.2d, v0.4s, v4.4s
+; CHECK-NEON-NEXT:    eor v19.16b, v13.16b, v19.16b
+; CHECK-NEON-NEXT:    uzp1 v24.4s, v0.4s, v24.4s
+; CHECK-NEON-NEXT:    uzp1 v2.4s, v0.4s, v2.4s
+; CHECK-NEON-NEXT:    and v25.16b, v1.16b, v25.16b
+; CHECK-NEON-NEXT:    and v21.16b, v1.16b, v21.16b
 ; CHECK-NEON-NEXT:    eor v7.16b, v7.16b, v16.16b
-; CHECK-NEON-NEXT:    eor v16.16b, v19.16b, v21.16b
-; CHECK-NEON-NEXT:    xtn v19.2s, v20.2d
+; CHECK-NEON-NEXT:    umull2 v16.2d, v0.4s, v3.4s
+; CHECK-NEON-NEXT:    ldr q3, [sp, #112] // 16-byte Reload
+; CHECK-NEON-NEXT:    xtn v20.2s, v27.2d
+; CHECK-NEON-NEXT:    and v22.16b, v1.16b, v22.16b
+; CHECK-NEON-NEXT:    and v27.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT:    ldr q3, [sp, #144] // 16-byte Reload
+; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v14.16b
+; CHECK-NEON-NEXT:    ldp q13, q14, [sp, #64] // 32-byte Folded Reload
+; CHECK-NEON-NEXT:    umull2 v4.2d, v0.4s, v2.4s
+; CHECK-NEON-NEXT:    ldr q2, [sp, #16] // 16-byte Reload
+; CHECK-NEON-NEXT:    and v15.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT:    eor v7.16b, v7.16b, v17.16b
+; CHECK-NEON-NEXT:    uzp1 v17.4s, v0.4s, v27.4s
+; CHECK-NEON-NEXT:    uzp1 v21.4s, v0.4s, v21.4s
+; CHECK-NEON-NEXT:    uzp1 v22.4s, v0.4s, v22.4s
+; CHECK-NEON-NEXT:    and v3.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT:    ldr q2, [sp, #32] // 16-byte Reload
+; CHECK-NEON-NEXT:    and v13.16b, v1.16b, v13.16b
+; CHECK-NEON-NEXT:    uzp1 v27.4s, v0.4s, v15.4s
+; CHECK-NEON-NEXT:    eor v16.16b, v16.16b, v4.16b
+; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v18.2s
+; CHECK-NEON-NEXT:    and v2.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v20.2s
+; CHECK-NEON-NEXT:    and v14.16b, v1.16b, v14.16b
+; CHECK-NEON-NEXT:    uzp1 v3.4s, v0.4s, v3.4s
+; CHECK-NEON-NEXT:    uzp1 v20.4s, v0.4s, v13.4s
+; CHECK-NEON-NEXT:    ldr q13, [sp, #96] // 16-byte Reload
+; CHECK-NEON-NEXT:    umull2 v17.2d, v0.4s, v17.4s
+; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v16.16b
+; CHECK-NEON-NEXT:    uzp1 v16.4s, v0.4s, v25.4s
+; CHECK-NEON-NEXT:    uzp1 v2.4s, v0.4s, v2.4s
+; CHECK-NEON-NEXT:    and v13.16b, v1.16b, v13.16b
+; CHECK-NEON-NEXT:    umull2 v27.2d, v0.4s, v27.4s
+; CHECK-NEON-NEXT:    uzp1 v14.4s, v0.4s, v14.4s
+; CHECK-NEON-NEXT:    eor v19.16b, v19.16b, v4.16b
+; CHECK-NEON-NEXT:    eor v4.16b, v7.16b, v18.16b
+; CHECK-NEON-NEXT:    umull2 v15.2d, v0.4s, v3.4s
+; CHECK-NEON-NEXT:    ldr q3, [sp, #48] // 16-byte Reload
+; CHECK-NEON-NEXT:    umull2 v7.2d, v0.4s, v20.4s
+; CHECK-NEON-NEXT:    uzp1 v13.4s, v0.4s, v13.4s
+; CHECK-NEON-NEXT:    umull2 v18.2d, v0.4s, v24.4s
+; CHECK-NEON-NEXT:    and v25.16b, v1.16b, v26.16b
+; CHECK-NEON-NEXT:    and v3.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT:    umull2 v2.2d, v0.4s, v2.4s
+; CHECK-NEON-NEXT:    eor v17.16b, v17.16b, v27.16b
+; CHECK-NEON-NEXT:    umull2 v24.2d, v0.4s, v14.4s
+; CHECK-NEON-NEXT:    and v26.16b, v1.16b, v28.16b
+; CHECK-NEON-NEXT:    umull2 v16.2d, v0.4s, v16.4s
+; CHECK-NEON-NEXT:    uzp1 v25.4s, v0.4s, v25.4s
+; CHECK-NEON-NEXT:    and v27.16b, v1.16b, v8.16b
+; CHECK-NEON-NEXT:    umull2 v21.2d, v0.4s, v21.4s
+; CHECK-NEON-NEXT:    uzp1 v3.4s, v0.4s, v3.4s
+; CHECK-NEON-NEXT:    umull2 v20.2d, v0.4s, v13.4s
+; CHECK-NEON-NEXT:    eor v7.16b, v17.16b, v7.16b
+; CHECK-NEON-NEXT:    eor v2.16b, v15.16b, v2.16b
+; CHECK-NEON-NEXT:    ldp d15, d14, [sp, #208] // 16-byte Folded Reload
+; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v29.16b
+; CHECK-NEON-NEXT:    umull2 v22.2d, v0.4s, v22.4s
+; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
+; CHECK-NEON-NEXT:    ldr q7, [sp] // 16-byte Reload
+; CHECK-NEON-NEXT:    umull2 v3.2d, v0.4s, v3.4s
+; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v18.16b
+; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v10.16b
+; CHECK-NEON-NEXT:    eor v20.16b, v20.16b, v24.16b
+; CHECK-NEON-NEXT:    uzp1 v17.4s, v0.4s, v17.4s
+; CHECK-NEON-NEXT:    uzp1 v24.4s, v0.4s, v26.4s
+; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
+; CHECK-NEON-NEXT:    and v26.16b, v1.16b, v31.16b
+; CHECK-NEON-NEXT:    uzp1 v18.4s, v0.4s, v18.4s
 ; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v4.16b, v3.16b
+; CHECK-NEON-NEXT:    and v3.16b, v1.16b, v23.16b
+; CHECK-NEON-NEXT:    uzp1 v23.4s, v0.4s, v27.4s
+; CHECK-NEON-NEXT:    eor v16.16b, v20.16b, v16.16b
+; CHECK-NEON-NEXT:    umull2 v20.2d, v0.4s, v25.4s
+; CHECK-NEON-NEXT:    and v25.16b, v1.16b, v11.16b
+; CHECK-NEON-NEXT:    ldp d11, d10, [sp, #240] // 16-byte Folded Reload
+; CHECK-NEON-NEXT:    umull2 v17.2d, v0.4s, v17.4s
+; CHECK-NEON-NEXT:    umull2 v24.2d, v0.4s, v24.4s
+; CHECK-NEON-NEXT:    uzp1 v3.4s, v0.4s, v3.4s
+; CHECK-NEON-NEXT:    uzp1 v7.4s, v0.4s, v7.4s
+; CHECK-NEON-NEXT:    and v27.16b, v1.16b, v12.16b
+; CHECK-NEON-NEXT:    ldp d13, d12, [sp, #224] // 16-byte Folded Reload
+; CHECK-NEON-NEXT:    umull2 v18.2d, v0.4s, v18.4s
+; CHECK-NEON-NEXT:    umull2 v23.2d, v0.4s, v23.4s
+; CHECK-NEON-NEXT:    uzp1 v25.4s, v0.4s, v25.4s
+; CHECK-NEON-NEXT:    eor v2.16b, v6.16b, v2.16b
+; CHECK-NEON-NEXT:    eor v6.16b, v16.16b, v20.16b
+; CHECK-NEON-NEXT:    dup v20.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #536870912 // =0x20000000
+; CHECK-NEON-NEXT:    eor v16.16b, v17.16b, v24.16b
+; CHECK-NEON-NEXT:    umull2 v3.2d, v0.4s, v3.4s
+; CHECK-NEON-NEXT:    uzp1 v24.4s, v0.4s, v27.4s
+; CHECK-NEON-NEXT:    umull2 v7.2d, v0.4s, v7.4s
+; CHECK-NEON-NEXT:    uzp1 v17.4s, v0.4s, v26.4s
+; CHECK-NEON-NEXT:    eor v18.16b, v18.16b, v23.16b
+; CHECK-NEON-NEXT:    umull2 v23.2d, v0.4s, v25.4s
+; CHECK-NEON-NEXT:    and v26.16b, v1.16b, v30.16b
+; CHECK-NEON-NEXT:    dup v25.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #1073741824 // =0x40000000
+; CHECK-NEON-NEXT:    eor v3.16b, v16.16b, v3.16b
+; CHECK-NEON-NEXT:    dup v27.2d, x8
+; CHECK-NEON-NEXT:    umull2 v16.2d, v0.4s, v24.4s
+; CHECK-NEON-NEXT:    and v24.16b, v1.16b, v9.16b
+; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
+; CHECK-NEON-NEXT:    ldp d9, d8, [sp, #256] // 16-byte Folded Reload
+; CHECK-NEON-NEXT:    movi v7.4s, #128, lsl #24
+; CHECK-NEON-NEXT:    umull2 v17.2d, v0.4s, v17.4s
+; CHECK-NEON-NEXT:    uzp1 v26.4s, v0.4s, v26.4s
+; CHECK-NEON-NEXT:    eor v18.16b, v18.16b, v23.16b
+; CHECK-NEON-NEXT:    uzp1 v24.4s, v0.4s, v24.4s
+; CHECK-NEON-NEXT:    and v23.16b, v5.16b, v20.16b
+; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
+; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v6.16b
+; CHECK-NEON-NEXT:    fneg v7.2d, v7.2d
+; CHECK-NEON-NEXT:    eor v16.16b, v18.16b, v16.16b
+; CHECK-NEON-NEXT:    and v18.16b, v5.16b, v27.16b
+; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v17.16b
+; CHECK-NEON-NEXT:    and v17.16b, v5.16b, v25.16b
+; CHECK-NEON-NEXT:    and v25.16b, v1.16b, v25.16b
+; CHECK-NEON-NEXT:    and v27.16b, v1.16b, v27.16b
+; CHECK-NEON-NEXT:    umull2 v26.2d, v0.4s, v26.4s
+; CHECK-NEON-NEXT:    umull2 v24.2d, v0.4s, v24.4s
+; CHECK-NEON-NEXT:    xtn v23.2s, v23.2d
+; CHECK-NEON-NEXT:    xtn v17.2s, v17.2d
+; CHECK-NEON-NEXT:    and v5.16b, v5.16b, v7.16b
+; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v7.16b
+; CHECK-NEON-NEXT:    xtn v7.2s, v18.2d
+; CHECK-NEON-NEXT:    uzp1 v18.4s, v0.4s, v20.4s
+; CHECK-NEON-NEXT:    uzp1 v20.4s, v0.4s, v25.4s
+; CHECK-NEON-NEXT:    uzp1 v25.4s, v0.4s, v27.4s
+; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v26.16b
+; CHECK-NEON-NEXT:    eor v16.16b, v16.16b, v24.16b
+; CHECK-NEON-NEXT:    xtn v5.2s, v5.2d
+; CHECK-NEON-NEXT:    uzp1 v1.4s, v0.4s, v1.4s
+; CHECK-NEON-NEXT:    umull v23.2d, v0.2s, v23.2s
 ; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    xtn v20.2s, v25.2d
-; CHECK-NEON-NEXT:    xtn v21.2s, v29.2d
-; CHECK-NEON-NEXT:    and v25.16b, v6.16b, v9.16b
-; CHECK-NEON-NEXT:    eor v16.16b, v16.16b, v18.16b
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v22.2s
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    eor v22.16b, v4.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v1.16b, v7.16b
-; CHECK-NEON-NEXT:    xtn v7.2s, v25.2d
-; CHECK-NEON-NEXT:    and v1.16b, v6.16b, v13.16b
-; CHECK-NEON-NEXT:    and v25.16b, v6.16b, v27.16b
-; CHECK-NEON-NEXT:    ldp d13, d12, [sp, #256] // 16-byte Folded Reload
-; CHECK-NEON-NEXT:    eor v16.16b, v16.16b, v17.16b
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v21.2s
-; CHECK-NEON-NEXT:    and v21.16b, v6.16b, v8.16b
-; CHECK-NEON-NEXT:    ldp d9, d8, [sp, #288] // 16-byte Folded Reload
-; CHECK-NEON-NEXT:    eor v22.16b, v22.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v18.16b, v18.16b, v19.16b
-; CHECK-NEON-NEXT:    and v19.16b, v6.16b, v24.16b
 ; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    xtn v21.2s, v21.2d
-; CHECK-NEON-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEON-NEXT:    eor v22.16b, v22.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v16.16b, v16.16b, v17.16b
-; CHECK-NEON-NEXT:    and v24.16b, v6.16b, v26.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v17.16b, v18.16b, v20.16b
-; CHECK-NEON-NEXT:    xtn v20.2s, v25.2d
-; CHECK-NEON-NEXT:    movi v25.2d, #0000000000000000
-; CHECK-NEON-NEXT:    xtn v18.2s, v19.2d
-; CHECK-NEON-NEXT:    xtn v19.2s, v23.2d
-; CHECK-NEON-NEXT:    fmov v23.2d, #2.00000000
-; CHECK-NEON-NEXT:    eor v22.16b, v22.16b, v3.16b
-; CHECK-NEON-NEXT:    umull v1.2d, v0.2s, v1.2s
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT:    umull2 v18.2d, v0.4s, v18.4s
+; CHECK-NEON-NEXT:    umull2 v20.2d, v0.4s, v20.4s
+; CHECK-NEON-NEXT:    umull2 v24.2d, v0.4s, v25.4s
+; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v21.16b
+; CHECK-NEON-NEXT:    eor v6.16b, v16.16b, v22.16b
+; CHECK-NEON-NEXT:    ldr q16, [sp, #192] // 16-byte Reload
+; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
+; CHECK-NEON-NEXT:    umull2 v0.2d, v0.4s, v1.4s
+; CHECK-NEON-NEXT:    eor v4.16b, v4.16b, v23.16b
+; CHECK-NEON-NEXT:    eor v16.16b, v16.16b, v19.16b
 ; CHECK-NEON-NEXT:    eor v7.16b, v17.16b, v7.16b
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v21.2s
-; CHECK-NEON-NEXT:    and v21.16b, v6.16b, v30.16b
-; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT:    fneg v25.2d, v25.2d
-; CHECK-NEON-NEXT:    eor v22.16b, v22.16b, v3.16b
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    and v26.16b, v6.16b, v23.16b
-; CHECK-NEON-NEXT:    xtn v21.2s, v21.2d
-; CHECK-NEON-NEXT:    eor v7.16b, v7.16b, v17.16b
-; CHECK-NEON-NEXT:    eor v1.16b, v16.16b, v1.16b
-; CHECK-NEON-NEXT:    xtn v16.2s, v24.2d
-; CHECK-NEON-NEXT:    eor v17.16b, v22.16b, v3.16b
-; CHECK-NEON-NEXT:    and v22.16b, v5.16b, v23.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    and v6.16b, v6.16b, v25.16b
-; CHECK-NEON-NEXT:    and v5.16b, v5.16b, v25.16b
-; CHECK-NEON-NEXT:    xtn v23.2s, v26.2d
-; CHECK-NEON-NEXT:    eor v7.16b, v7.16b, v18.16b
-; CHECK-NEON-NEXT:    eor v18.16b, v19.16b, v20.16b
-; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v21.2s
-; CHECK-NEON-NEXT:    eor v17.16b, v17.16b, v3.16b
-; CHECK-NEON-NEXT:    uzp1 v20.4s, v0.4s, v22.4s
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT:    uzp1 v5.4s, v0.4s, v5.4s
-; CHECK-NEON-NEXT:    eor v1.16b, v4.16b, v1.16b
-; CHECK-NEON-NEXT:    umull v21.2d, v0.2s, v23.2s
-; CHECK-NEON-NEXT:    eor v17.16b, v17.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v18.16b, v18.16b, v19.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v7.16b, v16.16b
-; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    umull2 v7.2d, v0.4s, v20.4s
-; CHECK-NEON-NEXT:    umull2 v0.2d, v0.4s, v5.4s
-; CHECK-NEON-NEXT:    eor v16.16b, v17.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v5.16b, v18.16b, v21.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v1.16b, v1.16b, v4.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v16.16b, v7.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v5.16b, v6.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v0.16b
-; CHECK-NEON-NEXT:    eor v0.16b, v1.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v1.16b, v2.16b, v4.16b
-; CHECK-NEON-NEXT:    add sp, sp, #320
+; CHECK-NEON-NEXT:    eor v1.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT:    eor v2.16b, v6.16b, v18.16b
+; CHECK-NEON-NEXT:    eor v3.16b, v20.16b, v24.16b
+; CHECK-NEON-NEXT:    eor v4.16b, v16.16b, v4.16b
+; CHECK-NEON-NEXT:    eor v5.16b, v7.16b, v5.16b
+; CHECK-NEON-NEXT:    eor v1.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT:    eor v2.16b, v3.16b, v0.16b
+; CHECK-NEON-NEXT:    eor v0.16b, v4.16b, v5.16b
+; CHECK-NEON-NEXT:    eor v1.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT:    add sp, sp, #288
 ; CHECK-NEON-NEXT:    ret
 ;
 ; CHECK-AES-LABEL: clmul_v4i64_neon_zext:
@@ -4598,120 +4524,75 @@ define <8 x i8> @clmulr_v8i8_neon(<8 x i8> %a, <8 x i8> %b) nounwind {
 define <8 x i16> @clmulr_v8i16_neon(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; CHECK-LABEL: clmulr_v8i16_neon:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.16b, #15
 ; CHECK-NEXT:    rev16 v1.16b, v1.16b
+; CHECK-NEXT:    rev16 v3.16b, v0.16b
+; CHECK-NEXT:    movi v2.8h, #2
+; CHECK-NEXT:    movi v4.8h, #1
+; CHECK-NEXT:    movi v5.8h, #4
+; CHECK-NEXT:    movi v6.8h, #8
+; CHECK-NEXT:    movi v7.8h, #16
+; CHECK-NEXT:    movi v16.8h, #32
+; CHECK-NEXT:    movi v17.8h, #128
+; CHECK-NEXT:    movi v18.8h, #1, lsl #8
+; CHECK-NEXT:    movi v19.8h, #8, lsl #8
+; CHECK-NEXT:    movi v20.8h, #16, lsl #8
+; CHECK-NEXT:    rbit v0.16b, v1.16b
+; CHECK-NEXT:    rbit v1.16b, v3.16b
+; CHECK-NEXT:    movi v3.8h, #64
+; CHECK-NEXT:    movi v21.8h, #2, lsl #8
+; CHECK-NEXT:    movi v22.8h, #32, lsl #8
+; CHECK-NEXT:    movi v23.8h, #4, lsl #8
+; CHECK-NEXT:    movi v24.8h, #64, lsl #8
+; CHECK-NEXT:    and v2.16b, v0.16b, v2.16b
+; CHECK-NEXT:    and v4.16b, v0.16b, v4.16b
+; CHECK-NEXT:    and v5.16b, v0.16b, v5.16b
+; CHECK-NEXT:    and v6.16b, v0.16b, v6.16b
+; CHECK-NEXT:    and v7.16b, v0.16b, v7.16b
+; CHECK-NEXT:    and v16.16b, v0.16b, v16.16b
+; CHECK-NEXT:    and v17.16b, v0.16b, v17.16b
+; CHECK-NEXT:    and v18.16b, v0.16b, v18.16b
+; CHECK-NEXT:    and v19.16b, v0.16b, v19.16b
+; CHECK-NEXT:    and v20.16b, v0.16b, v20.16b
+; CHECK-NEXT:    mul v2.8h, v1.8h, v2.8h
+; CHECK-NEXT:    mul v4.8h, v1.8h, v4.8h
+; CHECK-NEXT:    mul v5.8h, v1.8h, v5.8h
+; CHECK-NEXT:    mul v6.8h, v1.8h, v6.8h
+; CHECK-NEXT:    mul v7.8h, v1.8h, v7.8h
+; CHECK-NEXT:    mul v16.8h, v1.8h, v16.8h
+; CHECK-NEXT:    and v3.16b, v0.16b, v3.16b
+; CHECK-NEXT:    mul v17.8h, v1.8h, v17.8h
+; CHECK-NEXT:    mul v18.8h, v1.8h, v18.8h
+; CHECK-NEXT:    and v21.16b, v0.16b, v21.16b
+; CHECK-NEXT:    mul v19.8h, v1.8h, v19.8h
+; CHECK-NEXT:    mul v20.8h, v1.8h, v20.8h
+; CHECK-NEXT:    and v22.16b, v0.16b, v22.16b
+; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT:    eor v4.16b, v5.16b, v6.16b
+; CHECK-NEXT:    movi v6.8h, #128, lsl #8
+; CHECK-NEXT:    mul v3.8h, v1.8h, v3.8h
+; CHECK-NEXT:    mul v5.8h, v1.8h, v21.8h
+; CHECK-NEXT:    and v21.16b, v0.16b, v23.16b
+; CHECK-NEXT:    and v23.16b, v0.16b, v24.16b
+; CHECK-NEXT:    mul v22.8h, v1.8h, v22.8h
+; CHECK-NEXT:    eor v7.16b, v7.16b, v16.16b
+; CHECK-NEXT:    eor v16.16b, v17.16b, v18.16b
+; CHECK-NEXT:    eor v17.16b, v19.16b, v20.16b
+; CHECK-NEXT:    eor v2.16b, v2.16b, v4.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v6.16b
+; CHECK-NEXT:    mul v4.8h, v1.8h, v21.8h
+; CHECK-NEXT:    mul v6.8h, v1.8h, v23.8h
+; CHECK-NEXT:    eor v3.16b, v7.16b, v3.16b
+; CHECK-NEXT:    eor v5.16b, v16.16b, v5.16b
+; CHECK-NEXT:    eor v7.16b, v17.16b, v22.16b
+; CHECK-NEXT:    mul v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
+; CHECK-NEXT:    eor v2.16b, v5.16b, v4.16b
+; CHECK-NEXT:    eor v3.16b, v7.16b, v6.16b
+; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    rev16 v0.16b, v0.16b
-; CHECK-NEXT:    movi v7.8h, #1
-; CHECK-NEXT:    movi v17.8h, #4
-; CHECK-NEXT:    movi v18.8h, #8
-; CHECK-NEXT:    movi v19.8h, #16
-; CHECK-NEXT:    movi v20.8h, #32
-; CHECK-NEXT:    movi v22.8h, #128
-; CHECK-NEXT:    movi v23.8h, #1, lsl #8
-; CHECK-NEXT:    movi v25.8h, #8, lsl #8
-; CHECK-NEXT:    movi v26.8h, #16, lsl #8
-; CHECK-NEXT:    ushr v3.8h, v1.8h, #4
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    ushr v4.8h, v0.8h, #4
-; CHECK-NEXT:    and v5.16b, v0.16b, v2.16b
-; CHECK-NEXT:    movi v0.16b, #51
-; CHECK-NEXT:    movi v21.8h, #64
-; CHECK-NEXT:    movi v27.8h, #32, lsl #8
-; CHECK-NEXT:    movi v24.8h, #4, lsl #8
-; CHECK-NEXT:    and v3.16b, v3.16b, v2.16b
-; CHECK-NEXT:    shl v1.8h, v1.8h, #4
-; CHECK-NEXT:    and v4.16b, v4.16b, v2.16b
-; CHECK-NEXT:    shl v5.8h, v5.8h, #4
-; CHECK-NEXT:    orr v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    orr v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    ushr v4.8h, v1.8h, #2
-; CHECK-NEXT:    and v1.16b, v1.16b, v0.16b
-; CHECK-NEXT:    ushr v5.8h, v3.8h, #2
-; CHECK-NEXT:    and v3.16b, v3.16b, v0.16b
-; CHECK-NEXT:    and v4.16b, v4.16b, v0.16b
-; CHECK-NEXT:    shl v6.8h, v1.8h, #2
-; CHECK-NEXT:    movi v1.16b, #85
-; CHECK-NEXT:    and v5.16b, v5.16b, v0.16b
-; CHECK-NEXT:    shl v3.8h, v3.8h, #2
-; CHECK-NEXT:    orr v4.16b, v4.16b, v6.16b
-; CHECK-NEXT:    movi v6.8h, #2
-; CHECK-NEXT:    orr v3.16b, v5.16b, v3.16b
-; CHECK-NEXT:    ushr v5.8h, v4.8h, #1
-; CHECK-NEXT:    and v4.16b, v4.16b, v1.16b
-; CHECK-NEXT:    ushr v16.8h, v3.8h, #1
-; CHECK-NEXT:    and v3.16b, v3.16b, v1.16b
-; CHECK-NEXT:    and v5.16b, v5.16b, v1.16b
-; CHECK-NEXT:    add v4.8h, v4.8h, v4.8h
-; CHECK-NEXT:    and v16.16b, v16.16b, v1.16b
-; CHECK-NEXT:    add v3.8h, v3.8h, v3.8h
-; CHECK-NEXT:    orr v4.16b, v5.16b, v4.16b
-; CHECK-NEXT:    movi v5.8h, #2, lsl #8
-; CHECK-NEXT:    orr v3.16b, v16.16b, v3.16b
-; CHECK-NEXT:    movi v16.8h, #64, lsl #8
-; CHECK-NEXT:    and v6.16b, v4.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v4.16b, v7.16b
-; CHECK-NEXT:    and v17.16b, v4.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v4.16b, v18.16b
-; CHECK-NEXT:    and v19.16b, v4.16b, v19.16b
-; CHECK-NEXT:    and v20.16b, v4.16b, v20.16b
-; CHECK-NEXT:    and v22.16b, v4.16b, v22.16b
-; CHECK-NEXT:    and v23.16b, v4.16b, v23.16b
-; CHECK-NEXT:    and v25.16b, v4.16b, v25.16b
-; CHECK-NEXT:    and v26.16b, v4.16b, v26.16b
-; CHECK-NEXT:    mul v6.8h, v3.8h, v6.8h
-; CHECK-NEXT:    mul v7.8h, v3.8h, v7.8h
-; CHECK-NEXT:    mul v17.8h, v3.8h, v17.8h
-; CHECK-NEXT:    mul v18.8h, v3.8h, v18.8h
-; CHECK-NEXT:    and v21.16b, v4.16b, v21.16b
-; CHECK-NEXT:    and v5.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v27.16b, v4.16b, v27.16b
-; CHECK-NEXT:    mul v19.8h, v3.8h, v19.8h
-; CHECK-NEXT:    mul v20.8h, v3.8h, v20.8h
-; CHECK-NEXT:    mul v22.8h, v3.8h, v22.8h
-; CHECK-NEXT:    mul v23.8h, v3.8h, v23.8h
-; CHECK-NEXT:    mul v25.8h, v3.8h, v25.8h
-; CHECK-NEXT:    mul v26.8h, v3.8h, v26.8h
-; CHECK-NEXT:    eor v6.16b, v7.16b, v6.16b
-; CHECK-NEXT:    movi v7.8h, #128, lsl #8
-; CHECK-NEXT:    eor v17.16b, v17.16b, v18.16b
-; CHECK-NEXT:    and v18.16b, v4.16b, v24.16b
-; CHECK-NEXT:    mul v21.8h, v3.8h, v21.8h
-; CHECK-NEXT:    mul v5.8h, v3.8h, v5.8h
-; CHECK-NEXT:    mul v24.8h, v3.8h, v27.8h
-; CHECK-NEXT:    and v16.16b, v4.16b, v16.16b
-; CHECK-NEXT:    eor v19.16b, v19.16b, v20.16b
-; CHECK-NEXT:    eor v20.16b, v22.16b, v23.16b
-; CHECK-NEXT:    eor v22.16b, v25.16b, v26.16b
-; CHECK-NEXT:    eor v6.16b, v6.16b, v17.16b
-; CHECK-NEXT:    and v4.16b, v4.16b, v7.16b
-; CHECK-NEXT:    mul v7.8h, v3.8h, v18.8h
-; CHECK-NEXT:    mul v16.8h, v3.8h, v16.8h
-; CHECK-NEXT:    eor v17.16b, v19.16b, v21.16b
-; CHECK-NEXT:    eor v5.16b, v20.16b, v5.16b
-; CHECK-NEXT:    eor v18.16b, v22.16b, v24.16b
-; CHECK-NEXT:    mul v3.8h, v3.8h, v4.8h
-; CHECK-NEXT:    eor v4.16b, v6.16b, v17.16b
-; CHECK-NEXT:    eor v5.16b, v5.16b, v7.16b
-; CHECK-NEXT:    eor v6.16b, v18.16b, v16.16b
-; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
-; CHECK-NEXT:    eor v3.16b, v6.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT:    rev16 v3.16b, v3.16b
-; CHECK-NEXT:    ushr v4.8h, v3.8h, #4
-; CHECK-NEXT:    and v3.16b, v3.16b, v2.16b
-; CHECK-NEXT:    and v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    shl v3.8h, v3.8h, #4
-; CHECK-NEXT:    orr v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    ushr v3.8h, v2.8h, #2
-; CHECK-NEXT:    and v2.16b, v2.16b, v0.16b
-; CHECK-NEXT:    and v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    shl v2.8h, v2.8h, #2
-; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    ushr v2.8h, v0.8h, #1
-; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT:    add v0.8h, v0.8h, v0.8h
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    rbit v0.16b, v0.16b
 ; CHECK-NEXT:    ret
   %a.ext = zext <8 x i16> %a to <8 x i32>
   %b.ext = zext <8 x i16> %b to <8 x i32>
@@ -4966,202 +4847,191 @@ define <2 x i32> @clmulr_v2i32_neon(<2 x i32> %a, <2 x i32> %b) nounwind {
 ; CHECK-NEON-NEXT:    mov w9, #1 // =0x1
 ; CHECK-NEON-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-NEON-NEXT:    dup v2.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #4 // =0x4
 ; CHECK-NEON-NEXT:    dup v3.2d, x9
+; CHECK-NEON-NEXT:    mov w8, #4 // =0x4
+; CHECK-NEON-NEXT:    mov w9, #8 // =0x8
 ; CHECK-NEON-NEXT:    dup v4.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #8 // =0x8
-; CHECK-NEON-NEXT:    mov w9, #32 // =0x20
-; CHECK-NEON-NEXT:    dup v5.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #16 // =0x10
-; CHECK-NEON-NEXT:    dup v7.2d, x9
+; CHECK-NEON-NEXT:    dup v5.2d, x9
 ; CHECK-NEON-NEXT:    dup v6.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #32 // =0x20
 ; CHECK-NEON-NEXT:    and v2.16b, v1.16b, v2.16b
 ; CHECK-NEON-NEXT:    and v3.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT:    dup v7.2d, x8
 ; CHECK-NEON-NEXT:    and v4.16b, v1.16b, v4.16b
 ; CHECK-NEON-NEXT:    mov w8, #64 // =0x40
-; CHECK-NEON-NEXT:    movi v24.2d, #0000000000000000
+; CHECK-NEON-NEXT:    mov w9, #512 // =0x200
 ; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEON-NEXT:    dup v16.2d, x8
 ; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v6.16b
+; CHECK-NEON-NEXT:    dup v16.2d, x8
 ; CHECK-NEON-NEXT:    xtn v2.2s, v2.2d
-; CHECK-NEON-NEXT:    mov w8, #128 // =0x80
 ; CHECK-NEON-NEXT:    xtn v3.2s, v3.2d
+; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
 ; CHECK-NEON-NEXT:    xtn v4.2s, v4.2d
-; CHECK-NEON-NEXT:    dup v17.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #128 // =0x80
 ; CHECK-NEON-NEXT:    xtn v5.2s, v5.2d
-; CHECK-NEON-NEXT:    mov w8, #256 // =0x100
-; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
 ; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
+; CHECK-NEON-NEXT:    dup v17.2d, x8
+; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
+; CHECK-NEON-NEXT:    mov w8, #256 // =0x100
 ; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #512 // =0x200
 ; CHECK-NEON-NEXT:    umull v2.2d, v0.2s, v2.2s
-; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
 ; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
+; CHECK-NEON-NEXT:    dup v18.2d, x8
 ; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
-; CHECK-NEON-NEXT:    fmov v26.2d, #2.00000000
+; CHECK-NEON-NEXT:    mov w8, #2048 // =0x800
+; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
 ; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
 ; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    fneg v24.2d, v24.2d
+; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
 ; CHECK-NEON-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEON-NEXT:    and v26.16b, v1.16b, v26.16b
+; CHECK-NEON-NEXT:    xtn v17.2s, v17.2d
 ; CHECK-NEON-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEON-NEXT:    and v4.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    dup v5.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #2048 // =0x800
-; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    xtn v16.2s, v17.2d
-; CHECK-NEON-NEXT:    dup v17.2d, x8
+; CHECK-NEON-NEXT:    xtn v4.2s, v16.2d
+; CHECK-NEON-NEXT:    dup v16.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #4096 // =0x1000
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1024 // =0x400
-; CHECK-NEON-NEXT:    xtn v4.2s, v4.2d
-; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    dup v3.2d, x8
+; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v18.16b
+; CHECK-NEON-NEXT:    dup v18.2d, x9
+; CHECK-NEON-NEXT:    dup v19.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #8192 // =0x2000
-; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
+; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
+; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT:    xtn v3.2s, v5.2d
+; CHECK-NEON-NEXT:    eor v5.16b, v6.16b, v7.16b
+; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v16.16b
+; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v19.16b
 ; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
 ; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #65536 // =0x10000
-; CHECK-NEON-NEXT:    xtn v5.2s, v5.2d
+; CHECK-NEON-NEXT:    mov w8, #1024 // =0x400
 ; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
-; CHECK-NEON-NEXT:    xtn v17.2s, v17.2d
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v6.16b
-; CHECK-NEON-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEON-NEXT:    xtn v6.2s, v7.2d
-; CHECK-NEON-NEXT:    dup v7.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #131072 // =0x20000
 ; CHECK-NEON-NEXT:    dup v19.2d, x8
+; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
 ; CHECK-NEON-NEXT:    mov w8, #16384 // =0x4000
+; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
 ; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT:    eor v4.16b, v16.16b, v4.16b
-; CHECK-NEON-NEXT:    xtn v3.2s, v3.2d
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    dup v17.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #262144 // =0x40000
-; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
+; CHECK-NEON-NEXT:    dup v20.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #65536 // =0x10000
+; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
+; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
+; CHECK-NEON-NEXT:    dup v21.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #131072 // =0x20000
 ; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
-; CHECK-NEON-NEXT:    eor v4.16b, v4.16b, v5.16b
-; CHECK-NEON-NEXT:    dup v5.2d, x8
+; CHECK-NEON-NEXT:    eor v4.16b, v5.16b, v4.16b
+; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v6.2s
+; CHECK-NEON-NEXT:    dup v6.2d, x8
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
 ; CHECK-NEON-NEXT:    mov w8, #32768 // =0x8000
-; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT:    xtn v19.2s, v19.2d
-; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
-; CHECK-NEON-NEXT:    eor v6.16b, v16.16b, v6.16b
-; CHECK-NEON-NEXT:    dup v16.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #4194304 // =0x400000
-; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEON-NEXT:    dup v20.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #8388608 // =0x800000
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
+; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
+; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
+; CHECK-NEON-NEXT:    eor v3.16b, v17.16b, v3.16b
+; CHECK-NEON-NEXT:    xtn v17.2s, v19.2d
+; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v21.16b
+; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v6.16b
 ; CHECK-NEON-NEXT:    dup v21.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #16777216 // =0x1000000
+; CHECK-NEON-NEXT:    mov w8, #262144 // =0x40000
 ; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    xtn v17.2s, v17.2d
-; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    xtn v5.2s, v5.2d
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEON-NEXT:    dup v4.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #524288 // =0x80000
-; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT:    and v21.16b, v1.16b, v21.16b
+; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v4.16b
+; CHECK-NEON-NEXT:    xtn v20.2s, v20.2d
+; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v16.16b
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v18.2s
+; CHECK-NEON-NEXT:    dup v18.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #4194304 // =0x400000
+; CHECK-NEON-NEXT:    xtn v19.2s, v19.2d
+; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
 ; CHECK-NEON-NEXT:    dup v22.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1048576 // =0x100000
-; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v18.16b
+; CHECK-NEON-NEXT:    mov w8, #8388608 // =0x800000
 ; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT:    eor v7.16b, v7.16b, v19.16b
-; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEON-NEXT:    xtn v19.2s, v20.2d
-; CHECK-NEON-NEXT:    xtn v20.2s, v21.2d
-; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v22.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    movi v22.4s, #128, lsl #24
-; CHECK-NEON-NEXT:    xtn v21.2s, v4.2d
-; CHECK-NEON-NEXT:    eor v3.16b, v6.16b, v17.16b
-; CHECK-NEON-NEXT:    dup v17.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #2097152 // =0x200000
-; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    eor v5.16b, v7.16b, v5.16b
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT:    xtn v6.2s, v18.2d
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #33554432 // =0x2000000
-; CHECK-NEON-NEXT:    fneg v22.2d, v22.2d
+; CHECK-NEON-NEXT:    dup v23.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #524288 // =0x80000
+; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
+; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v18.16b
+; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v7.16b
+; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v16.16b
+; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v21.16b
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v19.2s
+; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
+; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v22.16b
+; CHECK-NEON-NEXT:    and v21.16b, v1.16b, v23.16b
+; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
+; CHECK-NEON-NEXT:    eor v4.16b, v3.16b, v17.16b
+; CHECK-NEON-NEXT:    movi v23.4s, #128, lsl #24
+; CHECK-NEON-NEXT:    eor v3.16b, v5.16b, v20.16b
+; CHECK-NEON-NEXT:    xtn v5.2s, v7.2d
+; CHECK-NEON-NEXT:    dup v7.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #16777216 // =0x1000000
+; CHECK-NEON-NEXT:    xtn v17.2s, v19.2d
+; CHECK-NEON-NEXT:    xtn v19.2s, v21.2d
 ; CHECK-NEON-NEXT:    dup v20.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #67108864 // =0x4000000
-; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v21.2s
+; CHECK-NEON-NEXT:    mov w8, #33554432 // =0x2000000
+; CHECK-NEON-NEXT:    eor v6.16b, v16.16b, v6.16b
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v18.2s
+; CHECK-NEON-NEXT:    dup v18.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #1048576 // =0x100000
 ; CHECK-NEON-NEXT:    dup v21.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #536870912 // =0x20000000
-; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEON-NEXT:    eor v7.16b, v7.16b, v16.16b
-; CHECK-NEON-NEXT:    dup v16.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1073741824 // =0x40000000
+; CHECK-NEON-NEXT:    mov w8, #2097152 // =0x200000
+; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
 ; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT:    dup v23.2d, x8
+; CHECK-NEON-NEXT:    dup v22.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #67108864 // =0x4000000
+; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
+; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
+; CHECK-NEON-NEXT:    fneg v23.2d, v23.2d
+; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v16.16b
+; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v18.16b
+; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
+; CHECK-NEON-NEXT:    xtn v18.2s, v20.2d
+; CHECK-NEON-NEXT:    dup v20.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #134217728 // =0x8000000
 ; CHECK-NEON-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT:    xtn v17.2s, v17.2d
-; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    eor v7.16b, v7.16b, v19.16b
+; CHECK-NEON-NEXT:    and v22.16b, v1.16b, v22.16b
+; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
+; CHECK-NEON-NEXT:    eor v17.16b, v17.16b, v19.16b
+; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
 ; CHECK-NEON-NEXT:    dup v19.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #268435456 // =0x10000000
-; CHECK-NEON-NEXT:    xtn v20.2s, v20.2d
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT:    and v23.16b, v1.16b, v23.16b
-; CHECK-NEON-NEXT:    xtn v21.2s, v21.2d
+; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
+; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
+; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
+; CHECK-NEON-NEXT:    dup v24.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #536870912 // =0x20000000
 ; CHECK-NEON-NEXT:    dup v25.2d, x8
-; CHECK-NEON-NEXT:    and v22.16b, v1.16b, v22.16b
+; CHECK-NEON-NEXT:    mov w8, #1073741824 // =0x40000000
 ; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
-; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT:    xtn v23.2s, v23.2d
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT:    and v25.16b, v1.16b, v25.16b
-; CHECK-NEON-NEXT:    xtn v22.2s, v22.2d
+; CHECK-NEON-NEXT:    xtn v21.2s, v21.2d
+; CHECK-NEON-NEXT:    xtn v20.2s, v20.2d
+; CHECK-NEON-NEXT:    dup v26.2d, x8
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
+; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
+; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v4.16b
+; CHECK-NEON-NEXT:    eor v17.16b, v17.16b, v18.16b
+; CHECK-NEON-NEXT:    xtn v18.2s, v22.2d
 ; CHECK-NEON-NEXT:    xtn v19.2s, v19.2d
+; CHECK-NEON-NEXT:    and v22.16b, v1.16b, v24.16b
+; CHECK-NEON-NEXT:    and v24.16b, v1.16b, v25.16b
+; CHECK-NEON-NEXT:    and v25.16b, v1.16b, v26.16b
 ; CHECK-NEON-NEXT:    umull v21.2d, v0.2s, v21.2s
-; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v4.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v5.16b, v6.16b
-; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v24.16b
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    umull v23.2d, v0.2s, v23.2s
-; CHECK-NEON-NEXT:    xtn v6.2s, v25.2d
-; CHECK-NEON-NEXT:    eor v5.16b, v7.16b, v20.16b
-; CHECK-NEON-NEXT:    xtn v7.2s, v26.2d
-; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v22.2s
-; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    eor v4.16b, v4.16b, v17.16b
+; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
+; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v23.16b
+; CHECK-NEON-NEXT:    eor v7.16b, v17.16b, v16.16b
+; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v5.16b
+; CHECK-NEON-NEXT:    xtn v16.2s, v22.2d
+; CHECK-NEON-NEXT:    xtn v17.2s, v24.2d
+; CHECK-NEON-NEXT:    xtn v22.2s, v25.2d
+; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v18.2s
+; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v19.2s
 ; CHECK-NEON-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEON-NEXT:    eor v5.16b, v6.16b, v21.16b
+; CHECK-NEON-NEXT:    eor v6.16b, v7.16b, v20.16b
 ; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v21.16b
-; CHECK-NEON-NEXT:    eor v16.16b, v16.16b, v23.16b
-; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    eor v4.16b, v4.16b, v18.16b
+; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v16.2s
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v17.2s
+; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v22.2s
+; CHECK-NEON-NEXT:    eor v3.16b, v5.16b, v4.16b
+; CHECK-NEON-NEXT:    eor v4.16b, v6.16b, v18.16b
 ; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v19.16b
-; CHECK-NEON-NEXT:    eor v7.16b, v16.16b, v20.16b
-; CHECK-NEON-NEXT:    eor v1.16b, v2.16b, v4.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v5.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v7.16b, v6.16b
+; CHECK-NEON-NEXT:    eor v1.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT:    eor v2.16b, v4.16b, v7.16b
+; CHECK-NEON-NEXT:    eor v3.16b, v16.16b, v17.16b
 ; CHECK-NEON-NEXT:    eor v1.16b, v1.16b, v2.16b
 ; CHECK-NEON-NEXT:    eor v0.16b, v3.16b, v0.16b
 ; CHECK-NEON-NEXT:    eor v0.16b, v1.16b, v0.16b
@@ -5242,120 +5112,75 @@ define <8 x i8> @clmulh_v8i8_neon(<8 x i8> %a, <8 x i8> %b) nounwind {
 define <8 x i16> @clmulh_v8i16_neon(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; CHECK-LABEL: clmulh_v8i16_neon:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.16b, #15
 ; CHECK-NEXT:    rev16 v1.16b, v1.16b
+; CHECK-NEXT:    rev16 v3.16b, v0.16b
+; CHECK-NEXT:    movi v2.8h, #2
+; CHECK-NEXT:    movi v4.8h, #1
+; CHECK-NEXT:    movi v5.8h, #4
+; CHECK-NEXT:    movi v6.8h, #8
+; CHECK-NEXT:    movi v7.8h, #16
+; CHECK-NEXT:    movi v16.8h, #32
+; CHECK-NEXT:    movi v17.8h, #128
+; CHECK-NEXT:    movi v18.8h, #1, lsl #8
+; CHECK-NEXT:    movi v19.8h, #8, lsl #8
+; CHECK-NEXT:    movi v20.8h, #16, lsl #8
+; CHECK-NEXT:    rbit v0.16b, v1.16b
+; CHECK-NEXT:    rbit v1.16b, v3.16b
+; CHECK-NEXT:    movi v3.8h, #64
+; CHECK-NEXT:    movi v21.8h, #2, lsl #8
+; CHECK-NEXT:    movi v22.8h, #32, lsl #8
+; CHECK-NEXT:    movi v23.8h, #4, lsl #8
+; CHECK-NEXT:    movi v24.8h, #64, lsl #8
+; CHECK-NEXT:    and v2.16b, v0.16b, v2.16b
+; CHECK-NEXT:    and v4.16b, v0.16b, v4.16b
+; CHECK-NEXT:    and v5.16b, v0.16b, v5.16b
+; CHECK-NEXT:    and v6.16b, v0.16b, v6.16b
+; CHECK-NEXT:    and v7.16b, v0.16b, v7.16b
+; CHECK-NEXT:    and v16.16b, v0.16b, v16.16b
+; CHECK-NEXT:    and v17.16b, v0.16b, v17.16b
+; CHECK-NEXT:    and v18.16b, v0.16b, v18.16b
+; CHECK-NEXT:    and v19.16b, v0.16b, v19.16b
+; CHECK-NEXT:    and v20.16b, v0.16b, v20.16b
+; CHECK-NEXT:    mul v2.8h, v1.8h, v2.8h
+; CHECK-NEXT:    mul v4.8h, v1.8h, v4.8h
+; CHECK-NEXT:    mul v5.8h, v1.8h, v5.8h
+; CHECK-NEXT:    mul v6.8h, v1.8h, v6.8h
+; CHECK-NEXT:    mul v7.8h, v1.8h, v7.8h
+; CHECK-NEXT:    mul v16.8h, v1.8h, v16.8h
+; CHECK-NEXT:    and v3.16b, v0.16b, v3.16b
+; CHECK-NEXT:    mul v17.8h, v1.8h, v17.8h
+; CHECK-NEXT:    mul v18.8h, v1.8h, v18.8h
+; CHECK-NEXT:    and v21.16b, v0.16b, v21.16b
+; CHECK-NEXT:    mul v19.8h, v1.8h, v19.8h
+; CHECK-NEXT:    mul v20.8h, v1.8h, v20.8h
+; CHECK-NEXT:    and v22.16b, v0.16b, v22.16b
+; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT:    eor v4.16b, v5.16b, v6.16b
+; CHECK-NEXT:    movi v6.8h, #128, lsl #8
+; CHECK-NEXT:    mul v3.8h, v1.8h, v3.8h
+; CHECK-NEXT:    mul v5.8h, v1.8h, v21.8h
+; CHECK-NEXT:    and v21.16b, v0.16b, v23.16b
+; CHECK-NEXT:    and v23.16b, v0.16b, v24.16b
+; CHECK-NEXT:    mul v22.8h, v1.8h, v22.8h
+; CHECK-NEXT:    eor v7.16b, v7.16b, v16.16b
+; CHECK-NEXT:    eor v16.16b, v17.16b, v18.16b
+; CHECK-NEXT:    eor v17.16b, v19.16b, v20.16b
+; CHECK-NEXT:    eor v2.16b, v2.16b, v4.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v6.16b
+; CHECK-NEXT:    mul v4.8h, v1.8h, v21.8h
+; CHECK-NEXT:    mul v6.8h, v1.8h, v23.8h
+; CHECK-NEXT:    eor v3.16b, v7.16b, v3.16b
+; CHECK-NEXT:    eor v5.16b, v16.16b, v5.16b
+; CHECK-NEXT:    eor v7.16b, v17.16b, v22.16b
+; CHECK-NEXT:    mul v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
+; CHECK-NEXT:    eor v2.16b, v5.16b, v4.16b
+; CHECK-NEXT:    eor v3.16b, v7.16b, v6.16b
+; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    rev16 v0.16b, v0.16b
-; CHECK-NEXT:    movi v7.8h, #1
-; CHECK-NEXT:    movi v17.8h, #4
-; CHECK-NEXT:    movi v18.8h, #8
-; CHECK-NEXT:    movi v19.8h, #16
-; CHECK-NEXT:    movi v20.8h, #32
-; CHECK-NEXT:    movi v22.8h, #128
-; CHECK-NEXT:    movi v23.8h, #1, lsl #8
-; CHECK-NEXT:    movi v25.8h, #8, lsl #8
-; CHECK-NEXT:    movi v26.8h, #16, lsl #8
-; CHECK-NEXT:    ushr v3.8h, v1.8h, #4
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    ushr v4.8h, v0.8h, #4
-; CHECK-NEXT:    and v5.16b, v0.16b, v2.16b
-; CHECK-NEXT:    movi v0.16b, #51
-; CHECK-NEXT:    movi v21.8h, #64
-; CHECK-NEXT:    movi v27.8h, #32, lsl #8
-; CHECK-NEXT:    movi v24.8h, #4, lsl #8
-; CHECK-NEXT:    and v3.16b, v3.16b, v2.16b
-; CHECK-NEXT:    shl v1.8h, v1.8h, #4
-; CHECK-NEXT:    and v4.16b, v4.16b, v2.16b
-; CHECK-NEXT:    shl v5.8h, v5.8h, #4
-; CHECK-NEXT:    orr v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    orr v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    ushr v4.8h, v1.8h, #2
-; CHECK-NEXT:    and v1.16b, v1.16b, v0.16b
-; CHECK-NEXT:    ushr v5.8h, v3.8h, #2
-; CHECK-NEXT:    and v3.16b, v3.16b, v0.16b
-; CHECK-NEXT:    and v4.16b, v4.16b, v0.16b
-; CHECK-NEXT:    shl v6.8h, v1.8h, #2
-; CHECK-NEXT:    movi v1.16b, #85
-; CHECK-NEXT:    and v5.16b, v5.16b, v0.16b
-; CHECK-NEXT:    shl v3.8h, v3.8h, #2
-; CHECK-NEXT:    orr v4.16b, v4.16b, v6.16b
-; CHECK-NEXT:    movi v6.8h, #2
-; CHECK-NEXT:    orr v3.16b, v5.16b, v3.16b
-; CHECK-NEXT:    ushr v5.8h, v4.8h, #1
-; CHECK-NEXT:    and v4.16b, v4.16b, v1.16b
-; CHECK-NEXT:    ushr v16.8h, v3.8h, #1
-; CHECK-NEXT:    and v3.16b, v3.16b, v1.16b
-; CHECK-NEXT:    and v5.16b, v5.16b, v1.16b
-; CHECK-NEXT:    add v4.8h, v4.8h, v4.8h
-; CHECK-NEXT:    and v16.16b, v16.16b, v1.16b
-; CHECK-NEXT:    add v3.8h, v3.8h, v3.8h
-; CHECK-NEXT:    orr v4.16b, v5.16b, v4.16b
-; CHECK-NEXT:    movi v5.8h, #2, lsl #8
-; CHECK-NEXT:    orr v3.16b, v16.16b, v3.16b
-; CHECK-NEXT:    movi v16.8h, #64, lsl #8
-; CHECK-NEXT:    and v6.16b, v4.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v4.16b, v7.16b
-; CHECK-NEXT:    and v17.16b, v4.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v4.16b, v18.16b
-; CHECK-NEXT:    and v19.16b, v4.16b, v19.16b
-; CHECK-NEXT:    and v20.16b, v4.16b, v20.16b
-; CHECK-NEXT:    and v22.16b, v4.16b, v22.16b
-; CHECK-NEXT:    and v23.16b, v4.16b, v23.16b
-; CHECK-NEXT:    and v25.16b, v4.16b, v25.16b
-; CHECK-NEXT:    and v26.16b, v4.16b, v26.16b
-; CHECK-NEXT:    mul v6.8h, v3.8h, v6.8h
-; CHECK-NEXT:    mul v7.8h, v3.8h, v7.8h
-; CHECK-NEXT:    mul v17.8h, v3.8h, v17.8h
-; CHECK-NEXT:    mul v18.8h, v3.8h, v18.8h
-; CHECK-NEXT:    and v21.16b, v4.16b, v21.16b
-; CHECK-NEXT:    and v5.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v27.16b, v4.16b, v27.16b
-; CHECK-NEXT:    mul v19.8h, v3.8h, v19.8h
-; CHECK-NEXT:    mul v20.8h, v3.8h, v20.8h
-; CHECK-NEXT:    mul v22.8h, v3.8h, v22.8h
-; CHECK-NEXT:    mul v23.8h, v3.8h, v23.8h
-; CHECK-NEXT:    mul v25.8h, v3.8h, v25.8h
-; CHECK-NEXT:    mul v26.8h, v3.8h, v26.8h
-; CHECK-NEXT:    eor v6.16b, v7.16b, v6.16b
-; CHECK-NEXT:    movi v7.8h, #128, lsl #8
-; CHECK-NEXT:    eor v17.16b, v17.16b, v18.16b
-; CHECK-NEXT:    and v18.16b, v4.16b, v24.16b
-; CHECK-NEXT:    mul v21.8h, v3.8h, v21.8h
-; CHECK-NEXT:    mul v5.8h, v3.8h, v5.8h
-; CHECK-NEXT:    mul v24.8h, v3.8h, v27.8h
-; CHECK-NEXT:    and v16.16b, v4.16b, v16.16b
-; CHECK-NEXT:    eor v19.16b, v19.16b, v20.16b
-; CHECK-NEXT:    eor v20.16b, v22.16b, v23.16b
-; CHECK-NEXT:    eor v22.16b, v25.16b, v26.16b
-; CHECK-NEXT:    eor v6.16b, v6.16b, v17.16b
-; CHECK-NEXT:    and v4.16b, v4.16b, v7.16b
-; CHECK-NEXT:    mul v7.8h, v3.8h, v18.8h
-; CHECK-NEXT:    mul v16.8h, v3.8h, v16.8h
-; CHECK-NEXT:    eor v17.16b, v19.16b, v21.16b
-; CHECK-NEXT:    eor v5.16b, v20.16b, v5.16b
-; CHECK-NEXT:    eor v18.16b, v22.16b, v24.16b
-; CHECK-NEXT:    mul v3.8h, v3.8h, v4.8h
-; CHECK-NEXT:    eor v4.16b, v6.16b, v17.16b
-; CHECK-NEXT:    eor v5.16b, v5.16b, v7.16b
-; CHECK-NEXT:    eor v6.16b, v18.16b, v16.16b
-; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
-; CHECK-NEXT:    eor v3.16b, v6.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT:    rev16 v3.16b, v3.16b
-; CHECK-NEXT:    ushr v4.8h, v3.8h, #4
-; CHECK-NEXT:    and v3.16b, v3.16b, v2.16b
-; CHECK-NEXT:    and v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    shl v3.8h, v3.8h, #4
-; CHECK-NEXT:    orr v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    ushr v3.8h, v2.8h, #2
-; CHECK-NEXT:    and v2.16b, v2.16b, v0.16b
-; CHECK-NEXT:    and v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    shl v2.8h, v2.8h, #2
-; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    ushr v2.8h, v0.8h, #1
-; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT:    add v0.8h, v0.8h, v0.8h
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    rbit v0.16b, v0.16b
 ; CHECK-NEXT:    ushr v0.8h, v0.8h, #1
 ; CHECK-NEXT:    ret
   %a.ext = zext <8 x i16> %a to <8 x i32>
@@ -5612,202 +5437,191 @@ define <2 x i32> @clmulh_v2i32_neon(<2 x i32> %a, <2 x i32> %b) nounwind {
 ; CHECK-NEON-NEXT:    mov w9, #1 // =0x1
 ; CHECK-NEON-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-NEON-NEXT:    dup v2.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #4 // =0x4
 ; CHECK-NEON-NEXT:    dup v3.2d, x9
+; CHECK-NEON-NEXT:    mov w8, #4 // =0x4
+; CHECK-NEON-NEXT:    mov w9, #8 // =0x8
 ; CHECK-NEON-NEXT:    dup v4.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #8 // =0x8
-; CHECK-NEON-NEXT:    mov w9, #32 // =0x20
-; CHECK-NEON-NEXT:    dup v5.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #16 // =0x10
-; CHECK-NEON-NEXT:    dup v7.2d, x9
+; CHECK-NEON-NEXT:    dup v5.2d, x9
 ; CHECK-NEON-NEXT:    dup v6.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #32 // =0x20
 ; CHECK-NEON-NEXT:    and v2.16b, v1.16b, v2.16b
 ; CHECK-NEON-NEXT:    and v3.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT:    dup v7.2d, x8
 ; CHECK-NEON-NEXT:    and v4.16b, v1.16b, v4.16b
 ; CHECK-NEON-NEXT:    mov w8, #64 // =0x40
-; CHECK-NEON-NEXT:    movi v24.2d, #0000000000000000
+; CHECK-NEON-NEXT:    mov w9, #512 // =0x200
 ; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEON-NEXT:    dup v16.2d, x8
 ; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v6.16b
+; CHECK-NEON-NEXT:    dup v16.2d, x8
 ; CHECK-NEON-NEXT:    xtn v2.2s, v2.2d
-; CHECK-NEON-NEXT:    mov w8, #128 // =0x80
 ; CHECK-NEON-NEXT:    xtn v3.2s, v3.2d
+; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
 ; CHECK-NEON-NEXT:    xtn v4.2s, v4.2d
-; CHECK-NEON-NEXT:    dup v17.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #128 // =0x80
 ; CHECK-NEON-NEXT:    xtn v5.2s, v5.2d
-; CHECK-NEON-NEXT:    mov w8, #256 // =0x100
-; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
 ; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
+; CHECK-NEON-NEXT:    dup v17.2d, x8
+; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
+; CHECK-NEON-NEXT:    mov w8, #256 // =0x100
 ; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #512 // =0x200
 ; CHECK-NEON-NEXT:    umull v2.2d, v0.2s, v2.2s
-; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
 ; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
+; CHECK-NEON-NEXT:    dup v18.2d, x8
 ; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
-; CHECK-NEON-NEXT:    fmov v26.2d, #2.00000000
+; CHECK-NEON-NEXT:    mov w8, #2048 // =0x800
+; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
 ; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
 ; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    fneg v24.2d, v24.2d
+; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
 ; CHECK-NEON-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEON-NEXT:    and v26.16b, v1.16b, v26.16b
+; CHECK-NEON-NEXT:    xtn v17.2s, v17.2d
 ; CHECK-NEON-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEON-NEXT:    and v4.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    dup v5.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #2048 // =0x800
-; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    xtn v16.2s, v17.2d
-; CHECK-NEON-NEXT:    dup v17.2d, x8
+; CHECK-NEON-NEXT:    xtn v4.2s, v16.2d
+; CHECK-NEON-NEXT:    dup v16.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #4096 // =0x1000
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1024 // =0x400
-; CHECK-NEON-NEXT:    xtn v4.2s, v4.2d
-; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    dup v3.2d, x8
+; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v18.16b
+; CHECK-NEON-NEXT:    dup v18.2d, x9
+; CHECK-NEON-NEXT:    dup v19.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #8192 // =0x2000
-; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
+; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
+; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT:    xtn v3.2s, v5.2d
+; CHECK-NEON-NEXT:    eor v5.16b, v6.16b, v7.16b
+; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v16.16b
+; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v19.16b
 ; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
 ; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #65536 // =0x10000
-; CHECK-NEON-NEXT:    xtn v5.2s, v5.2d
+; CHECK-NEON-NEXT:    mov w8, #1024 // =0x400
 ; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
-; CHECK-NEON-NEXT:    xtn v17.2s, v17.2d
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v6.16b
-; CHECK-NEON-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEON-NEXT:    xtn v6.2s, v7.2d
-; CHECK-NEON-NEXT:    dup v7.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #131072 // =0x20000
 ; CHECK-NEON-NEXT:    dup v19.2d, x8
+; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
 ; CHECK-NEON-NEXT:    mov w8, #16384 // =0x4000
+; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
 ; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT:    eor v4.16b, v16.16b, v4.16b
-; CHECK-NEON-NEXT:    xtn v3.2s, v3.2d
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    dup v17.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #262144 // =0x40000
-; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
+; CHECK-NEON-NEXT:    dup v20.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #65536 // =0x10000
+; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
+; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
+; CHECK-NEON-NEXT:    dup v21.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #131072 // =0x20000
 ; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
-; CHECK-NEON-NEXT:    eor v4.16b, v4.16b, v5.16b
-; CHECK-NEON-NEXT:    dup v5.2d, x8
+; CHECK-NEON-NEXT:    eor v4.16b, v5.16b, v4.16b
+; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v6.2s
+; CHECK-NEON-NEXT:    dup v6.2d, x8
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
 ; CHECK-NEON-NEXT:    mov w8, #32768 // =0x8000
-; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT:    xtn v19.2s, v19.2d
-; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
-; CHECK-NEON-NEXT:    eor v6.16b, v16.16b, v6.16b
-; CHECK-NEON-NEXT:    dup v16.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #4194304 // =0x400000
-; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEON-NEXT:    dup v20.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #8388608 // =0x800000
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
+; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
+; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
+; CHECK-NEON-NEXT:    eor v3.16b, v17.16b, v3.16b
+; CHECK-NEON-NEXT:    xtn v17.2s, v19.2d
+; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v21.16b
+; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v6.16b
 ; CHECK-NEON-NEXT:    dup v21.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #16777216 // =0x1000000
+; CHECK-NEON-NEXT:    mov w8, #262144 // =0x40000
 ; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    xtn v17.2s, v17.2d
-; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    xtn v5.2s, v5.2d
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEON-NEXT:    dup v4.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #524288 // =0x80000
-; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT:    and v21.16b, v1.16b, v21.16b
+; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v4.16b
+; CHECK-NEON-NEXT:    xtn v20.2s, v20.2d
+; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v16.16b
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v18.2s
+; CHECK-NEON-NEXT:    dup v18.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #4194304 // =0x400000
+; CHECK-NEON-NEXT:    xtn v19.2s, v19.2d
+; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
 ; CHECK-NEON-NEXT:    dup v22.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1048576 // =0x100000
-; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v18.16b
+; CHECK-NEON-NEXT:    mov w8, #8388608 // =0x800000
 ; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT:    eor v7.16b, v7.16b, v19.16b
-; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEON-NEXT:    xtn v19.2s, v20.2d
-; CHECK-NEON-NEXT:    xtn v20.2s, v21.2d
-; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v22.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    movi v22.4s, #128, lsl #24
-; CHECK-NEON-NEXT:    xtn v21.2s, v4.2d
-; CHECK-NEON-NEXT:    eor v3.16b, v6.16b, v17.16b
-; CHECK-NEON-NEXT:    dup v17.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #2097152 // =0x200000
-; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    eor v5.16b, v7.16b, v5.16b
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT:    xtn v6.2s, v18.2d
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #33554432 // =0x2000000
-; CHECK-NEON-NEXT:    fneg v22.2d, v22.2d
+; CHECK-NEON-NEXT:    dup v23.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #524288 // =0x80000
+; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
+; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v18.16b
+; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v7.16b
+; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v16.16b
+; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v21.16b
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v19.2s
+; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
+; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v22.16b
+; CHECK-NEON-NEXT:    and v21.16b, v1.16b, v23.16b
+; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
+; CHECK-NEON-NEXT:    eor v4.16b, v3.16b, v17.16b
+; CHECK-NEON-NEXT:    movi v23.4s, #128, lsl #24
+; CHECK-NEON-NEXT:    eor v3.16b, v5.16b, v20.16b
+; CHECK-NEON-NEXT:    xtn v5.2s, v7.2d
+; CHECK-NEON-NEXT:    dup v7.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #16777216 // =0x1000000
+; CHECK-NEON-NEXT:    xtn v17.2s, v19.2d
+; CHECK-NEON-NEXT:    xtn v19.2s, v21.2d
 ; CHECK-NEON-NEXT:    dup v20.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #67108864 // =0x4000000
-; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v21.2s
+; CHECK-NEON-NEXT:    mov w8, #33554432 // =0x2000000
+; CHECK-NEON-NEXT:    eor v6.16b, v16.16b, v6.16b
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v18.2s
+; CHECK-NEON-NEXT:    dup v18.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #1048576 // =0x100000
 ; CHECK-NEON-NEXT:    dup v21.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #536870912 // =0x20000000
-; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEON-NEXT:    eor v7.16b, v7.16b, v16.16b
-; CHECK-NEON-NEXT:    dup v16.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1073741824 // =0x40000000
+; CHECK-NEON-NEXT:    mov w8, #2097152 // =0x200000
+; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
 ; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT:    dup v23.2d, x8
+; CHECK-NEON-NEXT:    dup v22.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #67108864 // =0x4000000
+; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
+; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
+; CHECK-NEON-NEXT:    fneg v23.2d, v23.2d
+; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v16.16b
+; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v18.16b
+; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
+; CHECK-NEON-NEXT:    xtn v18.2s, v20.2d
+; CHECK-NEON-NEXT:    dup v20.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #134217728 // =0x8000000
 ; CHECK-NEON-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT:    xtn v17.2s, v17.2d
-; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    eor v7.16b, v7.16b, v19.16b
+; CHECK-NEON-NEXT:    and v22.16b, v1.16b, v22.16b
+; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
+; CHECK-NEON-NEXT:    eor v17.16b, v17.16b, v19.16b
+; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
 ; CHECK-NEON-NEXT:    dup v19.2d, x8
 ; CHECK-NEON-NEXT:    mov w8, #268435456 // =0x10000000
-; CHECK-NEON-NEXT:    xtn v20.2s, v20.2d
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT:    and v23.16b, v1.16b, v23.16b
-; CHECK-NEON-NEXT:    xtn v21.2s, v21.2d
+; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
+; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
+; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
+; CHECK-NEON-NEXT:    dup v24.2d, x8
+; CHECK-NEON-NEXT:    mov w8, #536870912 // =0x20000000
 ; CHECK-NEON-NEXT:    dup v25.2d, x8
-; CHECK-NEON-NEXT:    and v22.16b, v1.16b, v22.16b
+; CHECK-NEON-NEXT:    mov w8, #1073741824 // =0x40000000
 ; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
-; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT:    xtn v23.2s, v23.2d
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT:    and v25.16b, v1.16b, v25.16b
-; CHECK-NEON-NEXT:    xtn v22.2s, v22.2d
+; CHECK-NEON-NEXT:    xtn v21.2s, v21.2d
+; CHECK-NEON-NEXT:    xtn v20.2s, v20.2d
+; CHECK-NEON-NEXT:    dup v26.2d, x8
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
+; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
+; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v4.16b
+; CHECK-NEON-NEXT:    eor v17.16b, v17.16b, v18.16b
+; CHECK-NEON-NEXT:    xtn v18.2s, v22.2d
 ; CHECK-NEON-NEXT:    xtn v19.2s, v19.2d
+; CHECK-NEON-NEXT:    and v22.16b, v1.16b, v24.16b
+; CHECK-NEON-NEXT:    and v24.16b, v1.16b, v25.16b
+; CHECK-NEON-NEXT:    and v25.16b, v1.16b, v26.16b
 ; CHECK-NEON-NEXT:    umull v21.2d, v0.2s, v21.2s
-; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v4.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v5.16b, v6.16b
-; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v24.16b
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    umull v23.2d, v0.2s, v23.2s
-; CHECK-NEON-NEXT:    xtn v6.2s, v25.2d
-; CHECK-NEON-NEXT:    eor v5.16b, v7.16b, v20.16b
-; CHECK-NEON-NEXT:    xtn v7.2s, v26.2d
-; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v22.2s
-; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    eor v4.16b, v4.16b, v17.16b
+; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
+; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v23.16b
+; CHECK-NEON-NEXT:    eor v7.16b, v17.16b, v16.16b
+; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v5.16b
+; CHECK-NEON-NEXT:    xtn v16.2s, v22.2d
+; CHECK-NEON-NEXT:    xtn v17.2s, v24.2d
+; CHECK-NEON-NEXT:    xtn v22.2s, v25.2d
+; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v18.2s
+; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v19.2s
 ; CHECK-NEON-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEON-NEXT:    eor v5.16b, v6.16b, v21.16b
+; CHECK-NEON-NEXT:    eor v6.16b, v7.16b, v20.16b
 ; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v21.16b
-; CHECK-NEON-NEXT:    eor v16.16b, v16.16b, v23.16b
-; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    eor v4.16b, v4.16b, v18.16b
+; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v16.2s
+; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v17.2s
+; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v22.2s
+; CHECK-NEON-NEXT:    eor v3.16b, v5.16b, v4.16b
+; CHECK-NEON-NEXT:    eor v4.16b, v6.16b, v18.16b
 ; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v19.16b
-; CHECK-NEON-NEXT:    eor v7.16b, v16.16b, v20.16b
-; CHECK-NEON-NEXT:    eor v1.16b, v2.16b, v4.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v5.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v7.16b, v6.16b
+; CHECK-NEON-NEXT:    eor v1.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT:    eor v2.16b, v4.16b, v7.16b
+; CHECK-NEON-NEXT:    eor v3.16b, v16.16b, v17.16b
 ; CHECK-NEON-NEXT:    eor v1.16b, v1.16b, v2.16b
 ; CHECK-NEON-NEXT:    eor v0.16b, v3.16b, v0.16b
 ; CHECK-NEON-NEXT:    eor v0.16b, v1.16b, v0.16b
diff --git a/llvm/test/CodeGen/AArch64/clmul-scalable.ll b/llvm/test/CodeGen/AArch64/clmul-scalable.ll
index df7b7542d6cac..d2c7f98854d91 100644
--- a/llvm/test/CodeGen/AArch64/clmul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/clmul-scalable.ll
@@ -1530,17 +1530,12 @@ define <vscale x 16 x i8> @clmul_nxv16i8_zext(<vscale x 16 x i4> %x, <vscale x 1
 ; CHECK-SVE-NEXT:    and z2.b, z2.b, #0x2
 ; CHECK-SVE-NEXT:    and z3.b, z3.b, #0x1
 ; CHECK-SVE-NEXT:    and z4.b, z4.b, #0x4
-; CHECK-SVE-NEXT:    mul z1.b, p0/m, z1.b, z0.b
 ; CHECK-SVE-NEXT:    mul z2.b, p0/m, z2.b, z0.b
 ; CHECK-SVE-NEXT:    mul z3.b, p0/m, z3.b, z0.b
 ; CHECK-SVE-NEXT:    mul z4.b, p0/m, z4.b, z0.b
-; CHECK-SVE-NEXT:    mul z0.b, z0.b, #0
-; CHECK-SVE-NEXT:    eor z2.d, z3.d, z2.d
-; CHECK-SVE-NEXT:    eor z1.d, z4.d, z1.d
-; CHECK-SVE-NEXT:    eor z3.d, z0.d, z0.d
-; CHECK-SVE-NEXT:    eor z1.d, z2.d, z1.d
-; CHECK-SVE-NEXT:    eor z2.d, z3.d, z0.d
-; CHECK-SVE-NEXT:    eor z1.d, z1.d, z2.d
+; CHECK-SVE-NEXT:    mul z0.b, p0/m, z0.b, z1.b
+; CHECK-SVE-NEXT:    eor z1.d, z3.d, z2.d
+; CHECK-SVE-NEXT:    eor z0.d, z4.d, z0.d
 ; CHECK-SVE-NEXT:    eor z0.d, z1.d, z0.d
 ; CHECK-SVE-NEXT:    ret
 ;
@@ -1555,17 +1550,12 @@ define <vscale x 16 x i8> @clmul_nxv16i8_zext(<vscale x 16 x i4> %x, <vscale x 1
 ; CHECK-SVE-AES-NEXT:    and z2.b, z2.b, #0x2
 ; CHECK-SVE-AES-NEXT:    and z3.b, z3.b, #0x1
 ; CHECK-SVE-AES-NEXT:    and z4.b, z4.b, #0x4
-; CHECK-SVE-AES-NEXT:    mul z1.b, p0/m, z1.b, z0.b
 ; CHECK-SVE-AES-NEXT:    mul z2.b, p0/m, z2.b, z0.b
 ; CHECK-SVE-AES-NEXT:    mul z3.b, p0/m, z3.b, z0.b
 ; CHECK-SVE-AES-NEXT:    mul z4.b, p0/m, z4.b, z0.b
-; CHECK-SVE-AES-NEXT:    mul z0.b, z0.b, #0
-; CHECK-SVE-AES-NEXT:    eor z2.d, z3.d, z2.d
-; CHECK-SVE-AES-NEXT:    eor z1.d, z4.d, z1.d
-; CHECK-SVE-AES-NEXT:    eor z3.d, z0.d, z0.d
-; CHECK-SVE-AES-NEXT:    eor z1.d, z2.d, z1.d
-; CHECK-SVE-AES-NEXT:    eor z2.d, z3.d, z0.d
-; CHECK-SVE-AES-NEXT:    eor z1.d, z1.d, z2.d
+; CHECK-SVE-AES-NEXT:    mul z0.b, p0/m, z0.b, z1.b
+; CHECK-SVE-AES-NEXT:    eor z1.d, z3.d, z2.d
+; CHECK-SVE-AES-NEXT:    eor z0.d, z4.d, z0.d
 ; CHECK-SVE-AES-NEXT:    eor z0.d, z1.d, z0.d
 ; CHECK-SVE-AES-NEXT:    ret
 ;
@@ -1611,9 +1601,9 @@ define <vscale x 8 x i16> @clmul_nxv8i16_zext(<vscale x 8 x i8> %x, <vscale x 8
 ; CHECK-SVE-NEXT:    mov z5.d, z1.d
 ; CHECK-SVE-NEXT:    mov z6.d, z1.d
 ; CHECK-SVE-NEXT:    mov z7.d, z1.d
-; CHECK-SVE-NEXT:    mov z24.d, z1.d
 ; CHECK-SVE-NEXT:    and z0.h, z0.h, #0xff
-; CHECK-SVE-NEXT:    and z1.h, z1.h, #0x80
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    ptrue p0.h
 ; CHECK-SVE-NEXT:    and z2.h, z2.h, #0x2
 ; CHECK-SVE-NEXT:    and z3.h, z3.h, #0x1
 ; CHECK-SVE-NEXT:    and z4.h, z4.h, #0x4
@@ -1621,7 +1611,7 @@ define <vscale x 8 x i16> @clmul_nxv8i16_zext(<vscale x 8 x i8> %x, <vscale x 8
 ; CHECK-SVE-NEXT:    and z6.h, z6.h, #0x10
 ; CHECK-SVE-NEXT:    and z7.h, z7.h, #0x20
 ; CHECK-SVE-NEXT:    and z24.h, z24.h, #0x40
-; CHECK-SVE-NEXT:    ptrue p0.h
+; CHECK-SVE-NEXT:    and z1.h, z1.h, #0x80
 ; CHECK-SVE-NEXT:    mul z2.h, p0/m, z2.h, z0.h
 ; CHECK-SVE-NEXT:    mul z3.h, p0/m, z3.h, z0.h
 ; CHECK-SVE-NEXT:    mul z4.h, p0/m, z4.h, z0.h
@@ -1629,22 +1619,13 @@ define <vscale x 8 x i16> @clmul_nxv8i16_zext(<vscale x 8 x i8> %x, <vscale x 8
 ; CHECK-SVE-NEXT:    mul z6.h, p0/m, z6.h, z0.h
 ; CHECK-SVE-NEXT:    mul z7.h, p0/m, z7.h, z0.h
 ; CHECK-SVE-NEXT:    mul z24.h, p0/m, z24.h, z0.h
-; CHECK-SVE-NEXT:    mul z1.h, p0/m, z1.h, z0.h
-; CHECK-SVE-NEXT:    mul z0.h, z0.h, #0
+; CHECK-SVE-NEXT:    mul z0.h, p0/m, z0.h, z1.h
 ; CHECK-SVE-NEXT:    eor z2.d, z3.d, z2.d
 ; CHECK-SVE-NEXT:    eor z3.d, z4.d, z5.d
 ; CHECK-SVE-NEXT:    eor z4.d, z6.d, z7.d
-; CHECK-SVE-NEXT:    eor z1.d, z1.d, z0.d
-; CHECK-SVE-NEXT:    eor z5.d, z0.d, z0.d
 ; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
 ; CHECK-SVE-NEXT:    eor z3.d, z4.d, z24.d
-; CHECK-SVE-NEXT:    eor z1.d, z1.d, z0.d
-; CHECK-SVE-NEXT:    eor z4.d, z5.d, z0.d
-; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
-; CHECK-SVE-NEXT:    eor z1.d, z1.d, z0.d
-; CHECK-SVE-NEXT:    eor z3.d, z4.d, z0.d
-; CHECK-SVE-NEXT:    eor z1.d, z2.d, z1.d
-; CHECK-SVE-NEXT:    eor z0.d, z3.d, z0.d
+; CHECK-SVE-NEXT:    eor z1.d, z2.d, z3.d
 ; CHECK-SVE-NEXT:    eor z0.d, z1.d, z0.d
 ; CHECK-SVE-NEXT:    ret
 ;
@@ -1656,9 +1637,9 @@ define <vscale x 8 x i16> @clmul_nxv8i16_zext(<vscale x 8 x i8> %x, <vscale x 8
 ; CHECK-SVE-AES-NEXT:    mov z5.d, z1.d
 ; CHECK-SVE-AES-NEXT:    mov z6.d, z1.d
 ; CHECK-SVE-AES-NEXT:    mov z7.d, z1.d
-; CHECK-SVE-AES-NEXT:    mov z24.d, z1.d
 ; CHECK-SVE-AES-NEXT:    and z0.h, z0.h, #0xff
-; CHECK-SVE-AES-NEXT:    and z1.h, z1.h, #0x80
+; CHECK-SVE-AES-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-AES-NEXT:    ptrue p0.h
 ; CHECK-SVE-AES-NEXT:    and z2.h, z2.h, #0x2
 ; CHECK-SVE-AES-NEXT:    and z3.h, z3.h, #0x1
 ; CHECK-SVE-AES-NEXT:    and z4.h, z4.h, #0x4
@@ -1666,7 +1647,7 @@ define <vscale x 8 x i16> @clmul_nxv8i16_zext(<vscale x 8 x i8> %x, <vscale x 8
 ; CHECK-SVE-AES-NEXT:    and z6.h, z6.h, #0x10
 ; CHECK-SVE-AES-NEXT:    and z7.h, z7.h, #0x20
 ; CHECK-SVE-AES-NEXT:    and z24.h, z24.h, #0x40
-; CHECK-SVE-AES-NEXT:    ptrue p0.h
+; CHECK-SVE-AES-NEXT:    and z1.h, z1.h, #0x80
 ; CHECK-SVE-AES-NEXT:    mul z2.h, p0/m, z2.h, z0.h
 ; CHECK-SVE-AES-NEXT:    mul z3.h, p0/m, z3.h, z0.h
 ; CHECK-SVE-AES-NEXT:    mul z4.h, p0/m, z4.h, z0.h
@@ -1674,22 +1655,13 @@ define <vscale x 8 x i16> @clmul_nxv8i16_zext(<vscale x 8 x i8> %x, <vscale x 8
 ; CHECK-SVE-AES-NEXT:    mul z6.h, p0/m, z6.h, z0.h
 ; CHECK-SVE-AES-NEXT:    mul z7.h, p0/m, z7.h, z0.h
 ; CHECK-SVE-AES-NEXT:    mul z24.h, p0/m, z24.h, z0.h
-; CHECK-SVE-AES-NEXT:    mul z1.h, p0/m, z1.h, z0.h
-; CHECK-SVE-AES-NEXT:    mul z0.h, z0.h, #0
+; CHECK-SVE-AES-NEXT:    mul z0.h, p0/m, z0.h, z1.h
 ; CHECK-SVE-AES-NEXT:    eor z2.d, z3.d, z2.d
 ; CHECK-SVE-AES-NEXT:    eor z3.d, z4.d, z5.d
 ; CHECK-SVE-AES-NEXT:    eor z4.d, z6.d, z7.d
-; CHECK-SVE-AES-NEXT:    eor z1.d, z1.d, z0.d
-; CHECK-SVE-AES-NEXT:    eor z5.d, z0.d, z0.d
 ; CHECK-SVE-AES-NEXT:    eor z2.d, z2.d, z3.d
 ; CHECK-SVE-AES-NEXT:    eor z3.d, z4.d, z24.d
-; CHECK-SVE-AES-NEXT:    eor z1.d, z1.d, z0.d
-; CHECK-SVE-AES-NEXT:    eor z4.d, z5.d, z0.d
-; CHECK-SVE-AES-NEXT:    eor z2.d, z2.d, z3.d
-; CHECK-SVE-AES-NEXT:    eor z1.d, z1.d, z0.d
-; CHECK-SVE-AES-NEXT:    eor z3.d, z4.d, z0.d
-; CHECK-SVE-AES-NEXT:    eor z1.d, z2.d, z1.d
-; CHECK-SVE-AES-NEXT:    eor z0.d, z3.d, z0.d
+; CHECK-SVE-AES-NEXT:    eor z1.d, z2.d, z3.d
 ; CHECK-SVE-AES-NEXT:    eor z0.d, z1.d, z0.d
 ; CHECK-SVE-AES-NEXT:    ret
 ;
@@ -1719,15 +1691,10 @@ define <vscale x 8 x i16> @clmul_nxv8i16_zext(<vscale x 8 x i8> %x, <vscale x 8
 ; CHECK-SME-STREAMING-NEXT:    and z1.h, z1.h, #0x40
 ; CHECK-SME-STREAMING-NEXT:    mul z3.h, z0.h, z3.h
 ; CHECK-SME-STREAMING-NEXT:    and z5.h, z5.h, #0x80
-; CHECK-SME-STREAMING-NEXT:    mul z1.h, z0.h, z1.h
 ; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z3.d, z4.d
 ; CHECK-SME-STREAMING-NEXT:    mul z3.h, z0.h, z5.h
-; CHECK-SME-STREAMING-NEXT:    mul z0.h, z0.h, #0
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z1.d, z3.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SME-STREAMING-NEXT:    mul z0.h, z0.h, z1.h
+; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z3.d
 ; CHECK-SME-STREAMING-NEXT:    mov z0.d, z2.d
 ; CHECK-SME-STREAMING-NEXT:    ret
 ;
@@ -1757,15 +1724,10 @@ define <vscale x 8 x i16> @clmul_nxv8i16_zext(<vscale x 8 x i8> %x, <vscale x 8
 ; CHECK-SME-STREAMING-SSVE-AES-NEXT:    and z1.h, z1.h, #0x40
 ; CHECK-SME-STREAMING-SSVE-AES-NEXT:    mul z3.h, z0.h, z3.h
 ; CHECK-SME-STREAMING-SSVE-AES-NEXT:    and z5.h, z5.h, #0x80
-; CHECK-SME-STREAMING-SSVE-AES-NEXT:    mul z1.h, z0.h, z1.h
 ; CHECK-SME-STREAMING-SSVE-AES-NEXT:    eor3 z2.d, z2.d, z3.d, z4.d
 ; CHECK-SME-STREAMING-SSVE-AES-NEXT:    mul z3.h, z0.h, z5.h
-; CHECK-SME-STREAMING-SSVE-AES-NEXT:    mul z0.h, z0.h, #0
-; CHECK-SME-STREAMING-SSVE-AES-NEXT:    eor3 z2.d, z2.d, z1.d, z3.d
-; CHECK-SME-STREAMING-SSVE-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-SSVE-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-SSVE-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-SSVE-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SME-STREAMING-SSVE-AES-NEXT:    mul z0.h, z0.h, z1.h
+; CHECK-SME-STREAMING-SSVE-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z3.d
 ; CHECK-SME-STREAMING-SSVE-AES-NEXT:    mov z0.d, z2.d
 ; CHECK-SME-STREAMING-SSVE-AES-NEXT:    ret
 ;
@@ -1795,15 +1757,10 @@ define <vscale x 8 x i16> @clmul_nxv8i16_zext(<vscale x 8 x i8> %x, <vscale x 8
 ; CHECK-SVE2-NEXT:    and z1.h, z1.h, #0x40
 ; CHECK-SVE2-NEXT:    mul z3.h, z0.h, z3.h
 ; CHECK-SVE2-NEXT:    and z5.h, z5.h, #0x80
-; CHECK-SVE2-NEXT:    mul z1.h, z0.h, z1.h
 ; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z3.d, z4.d
 ; CHECK-SVE2-NEXT:    mul z3.h, z0.h, z5.h
-; CHECK-SVE2-NEXT:    mul z0.h, z0.h, #0
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z1.d, z3.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    mul z0.h, z0.h, z1.h
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z3.d
 ; CHECK-SVE2-NEXT:    mov z0.d, z2.d
 ; CHECK-SVE2-NEXT:    ret
 ;
@@ -1833,15 +1790,10 @@ define <vscale x 8 x i16> @clmul_nxv8i16_zext(<vscale x 8 x i8> %x, <vscale x 8
 ; CHECK-SVE2-AES-NEXT:    and z1.h, z1.h, #0x40
 ; CHECK-SVE2-AES-NEXT:    mul z3.h, z0.h, z3.h
 ; CHECK-SVE2-AES-NEXT:    and z5.h, z5.h, #0x80
-; CHECK-SVE2-AES-NEXT:    mul z1.h, z0.h, z1.h
 ; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z3.d, z4.d
 ; CHECK-SVE2-AES-NEXT:    mul z3.h, z0.h, z5.h
-; CHECK-SVE2-AES-NEXT:    mul z0.h, z0.h, #0
-; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z1.d, z3.d
-; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    mul z0.h, z0.h, z1.h
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z3.d
 ; CHECK-SVE2-AES-NEXT:    mov z0.d, z2.d
 ; CHECK-SVE2-AES-NEXT:    ret
   %zextx = zext <vscale x 8 x i8> %x to <vscale x 8 x i16>
@@ -1853,176 +1805,138 @@ define <vscale x 8 x i16> @clmul_nxv8i16_zext(<vscale x 8 x i8> %x, <vscale x 8
 define <vscale x 4 x i32> @clmul_nxv4i32_zext(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) {
 ; CHECK-SVE-LABEL: clmul_nxv4i32_zext:
 ; CHECK-SVE:       // %bb.0:
-; CHECK-SVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SVE-NEXT:    addvl sp, sp, #-1
-; CHECK-SVE-NEXT:    str z8, [sp] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-SVE-NEXT:    .cfi_offset w29, -16
-; CHECK-SVE-NEXT:    .cfi_escape 0x10, 0x48, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x40, 0x1c // $d8 @ cfa - 8 * VG - 16
 ; CHECK-SVE-NEXT:    mov z2.d, z1.d
 ; CHECK-SVE-NEXT:    mov z3.d, z1.d
 ; CHECK-SVE-NEXT:    mov z4.d, z1.d
 ; CHECK-SVE-NEXT:    mov z5.d, z1.d
+; CHECK-SVE-NEXT:    and z0.s, z0.s, #0xffff
+; CHECK-SVE-NEXT:    ptrue p0.s
 ; CHECK-SVE-NEXT:    mov z6.d, z1.d
 ; CHECK-SVE-NEXT:    mov z7.d, z1.d
 ; CHECK-SVE-NEXT:    mov z24.d, z1.d
-; CHECK-SVE-NEXT:    mov z25.d, z1.d
-; CHECK-SVE-NEXT:    mov z26.d, z1.d
-; CHECK-SVE-NEXT:    mov z27.d, z1.d
-; CHECK-SVE-NEXT:    mov z28.d, z1.d
-; CHECK-SVE-NEXT:    mov z29.d, z1.d
-; CHECK-SVE-NEXT:    mov z30.d, z1.d
-; CHECK-SVE-NEXT:    mov z31.d, z1.d
-; CHECK-SVE-NEXT:    mov z8.d, z1.d
-; CHECK-SVE-NEXT:    and z0.s, z0.s, #0xffff
 ; CHECK-SVE-NEXT:    and z2.s, z2.s, #0x2
 ; CHECK-SVE-NEXT:    and z3.s, z3.s, #0x1
 ; CHECK-SVE-NEXT:    and z4.s, z4.s, #0x4
 ; CHECK-SVE-NEXT:    and z5.s, z5.s, #0x8
+; CHECK-SVE-NEXT:    mov z25.d, z1.d
+; CHECK-SVE-NEXT:    mov z26.d, z1.d
+; CHECK-SVE-NEXT:    mov z27.d, z1.d
 ; CHECK-SVE-NEXT:    and z6.s, z6.s, #0x10
 ; CHECK-SVE-NEXT:    and z7.s, z7.s, #0x20
-; CHECK-SVE-NEXT:    and z24.s, z24.s, #0x40
-; CHECK-SVE-NEXT:    and z25.s, z25.s, #0x80
-; CHECK-SVE-NEXT:    and z26.s, z26.s, #0x100
-; CHECK-SVE-NEXT:    and z27.s, z27.s, #0x200
-; CHECK-SVE-NEXT:    and z28.s, z28.s, #0x400
-; CHECK-SVE-NEXT:    and z29.s, z29.s, #0x800
-; CHECK-SVE-NEXT:    and z30.s, z30.s, #0x1000
-; CHECK-SVE-NEXT:    and z31.s, z31.s, #0x2000
-; CHECK-SVE-NEXT:    and z8.s, z8.s, #0x4000
-; CHECK-SVE-NEXT:    and z1.s, z1.s, #0x8000
-; CHECK-SVE-NEXT:    ptrue p0.s
 ; CHECK-SVE-NEXT:    mul z2.s, p0/m, z2.s, z0.s
 ; CHECK-SVE-NEXT:    mul z3.s, p0/m, z3.s, z0.s
+; CHECK-SVE-NEXT:    and z24.s, z24.s, #0x80
 ; CHECK-SVE-NEXT:    mul z4.s, p0/m, z4.s, z0.s
 ; CHECK-SVE-NEXT:    mul z5.s, p0/m, z5.s, z0.s
+; CHECK-SVE-NEXT:    and z25.s, z25.s, #0x100
 ; CHECK-SVE-NEXT:    mul z6.s, p0/m, z6.s, z0.s
 ; CHECK-SVE-NEXT:    mul z7.s, p0/m, z7.s, z0.s
+; CHECK-SVE-NEXT:    mov z28.d, z1.d
 ; CHECK-SVE-NEXT:    mul z24.s, p0/m, z24.s, z0.s
+; CHECK-SVE-NEXT:    and z26.s, z26.s, #0x800
+; CHECK-SVE-NEXT:    and z27.s, z27.s, #0x1000
 ; CHECK-SVE-NEXT:    mul z25.s, p0/m, z25.s, z0.s
+; CHECK-SVE-NEXT:    mov z29.d, z1.d
+; CHECK-SVE-NEXT:    mov z30.d, z1.d
+; CHECK-SVE-NEXT:    and z28.s, z28.s, #0x40
+; CHECK-SVE-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z5.d
 ; CHECK-SVE-NEXT:    mul z26.s, p0/m, z26.s, z0.s
 ; CHECK-SVE-NEXT:    mul z27.s, p0/m, z27.s, z0.s
-; CHECK-SVE-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE-NEXT:    mov z4.d, z1.d
+; CHECK-SVE-NEXT:    and z29.s, z29.s, #0x200
+; CHECK-SVE-NEXT:    and z30.s, z30.s, #0x2000
+; CHECK-SVE-NEXT:    mov z5.d, z1.d
 ; CHECK-SVE-NEXT:    mul z28.s, p0/m, z28.s, z0.s
+; CHECK-SVE-NEXT:    eor z6.d, z6.d, z7.d
+; CHECK-SVE-NEXT:    eor z7.d, z24.d, z25.d
+; CHECK-SVE-NEXT:    and z4.s, z4.s, #0x400
+; CHECK-SVE-NEXT:    and z1.s, z1.s, #0x8000
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
 ; CHECK-SVE-NEXT:    mul z29.s, p0/m, z29.s, z0.s
-; CHECK-SVE-NEXT:    eor z3.d, z4.d, z5.d
 ; CHECK-SVE-NEXT:    mul z30.s, p0/m, z30.s, z0.s
-; CHECK-SVE-NEXT:    mul z31.s, p0/m, z31.s, z0.s
-; CHECK-SVE-NEXT:    eor z4.d, z6.d, z7.d
-; CHECK-SVE-NEXT:    mul z8.s, p0/m, z8.s, z0.s
-; CHECK-SVE-NEXT:    mul z1.s, p0/m, z1.s, z0.s
-; CHECK-SVE-NEXT:    mul z0.s, z0.s, #0
-; CHECK-SVE-NEXT:    eor z5.d, z25.d, z26.d
-; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
-; CHECK-SVE-NEXT:    eor z3.d, z4.d, z24.d
-; CHECK-SVE-NEXT:    eor z6.d, z29.d, z30.d
-; CHECK-SVE-NEXT:    eor z4.d, z5.d, z27.d
-; CHECK-SVE-NEXT:    eor z7.d, z0.d, z0.d
-; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
-; CHECK-SVE-NEXT:    eor z5.d, z6.d, z31.d
-; CHECK-SVE-NEXT:    eor z3.d, z4.d, z28.d
-; CHECK-SVE-NEXT:    eor z6.d, z7.d, z0.d
-; CHECK-SVE-NEXT:    eor z4.d, z5.d, z8.d
-; CHECK-SVE-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
-; CHECK-SVE-NEXT:    eor z5.d, z6.d, z0.d
-; CHECK-SVE-NEXT:    eor z1.d, z4.d, z1.d
-; CHECK-SVE-NEXT:    eor z3.d, z5.d, z0.d
-; CHECK-SVE-NEXT:    eor z1.d, z2.d, z1.d
-; CHECK-SVE-NEXT:    eor z2.d, z3.d, z0.d
+; CHECK-SVE-NEXT:    and z5.s, z5.s, #0x4000
+; CHECK-SVE-NEXT:    eor z24.d, z26.d, z27.d
+; CHECK-SVE-NEXT:    mul z4.s, p0/m, z4.s, z0.s
+; CHECK-SVE-NEXT:    eor z3.d, z6.d, z28.d
+; CHECK-SVE-NEXT:    mul z5.s, p0/m, z5.s, z0.s
+; CHECK-SVE-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-SVE-NEXT:    eor z6.d, z7.d, z29.d
+; CHECK-SVE-NEXT:    eor z7.d, z24.d, z30.d
+; CHECK-SVE-NEXT:    eor z1.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    eor z2.d, z6.d, z4.d
+; CHECK-SVE-NEXT:    eor z3.d, z7.d, z5.d
 ; CHECK-SVE-NEXT:    eor z1.d, z1.d, z2.d
-; CHECK-SVE-NEXT:    eor z0.d, z2.d, z0.d
+; CHECK-SVE-NEXT:    eor z0.d, z3.d, z0.d
 ; CHECK-SVE-NEXT:    eor z0.d, z1.d, z0.d
-; CHECK-SVE-NEXT:    eor z0.d, z0.d, z6.d
-; CHECK-SVE-NEXT:    addvl sp, sp, #1
-; CHECK-SVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-SVE-NEXT:    ret
 ;
 ; CHECK-SVE-AES-LABEL: clmul_nxv4i32_zext:
 ; CHECK-SVE-AES:       // %bb.0:
-; CHECK-SVE-AES-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SVE-AES-NEXT:    addvl sp, sp, #-1
-; CHECK-SVE-AES-NEXT:    str z8, [sp] // 16-byte Folded Spill
-; CHECK-SVE-AES-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-SVE-AES-NEXT:    .cfi_offset w29, -16
-; CHECK-SVE-AES-NEXT:    .cfi_escape 0x10, 0x48, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x40, 0x1c // $d8 @ cfa - 8 * VG - 16
 ; CHECK-SVE-AES-NEXT:    mov z2.d, z1.d
 ; CHECK-SVE-AES-NEXT:    mov z3.d, z1.d
 ; CHECK-SVE-AES-NEXT:    mov z4.d, z1.d
 ; CHECK-SVE-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE-AES-NEXT:    and z0.s, z0.s, #0xffff
+; CHECK-SVE-AES-NEXT:    ptrue p0.s
 ; CHECK-SVE-AES-NEXT:    mov z6.d, z1.d
 ; CHECK-SVE-AES-NEXT:    mov z7.d, z1.d
 ; CHECK-SVE-AES-NEXT:    mov z24.d, z1.d
-; CHECK-SVE-AES-NEXT:    mov z25.d, z1.d
-; CHECK-SVE-AES-NEXT:    mov z26.d, z1.d
-; CHECK-SVE-AES-NEXT:    mov z27.d, z1.d
-; CHECK-SVE-AES-NEXT:    mov z28.d, z1.d
-; CHECK-SVE-AES-NEXT:    mov z29.d, z1.d
-; CHECK-SVE-AES-NEXT:    mov z30.d, z1.d
-; CHECK-SVE-AES-NEXT:    mov z31.d, z1.d
-; CHECK-SVE-AES-NEXT:    mov z8.d, z1.d
-; CHECK-SVE-AES-NEXT:    and z0.s, z0.s, #0xffff
 ; CHECK-SVE-AES-NEXT:    and z2.s, z2.s, #0x2
 ; CHECK-SVE-AES-NEXT:    and z3.s, z3.s, #0x1
 ; CHECK-SVE-AES-NEXT:    and z4.s, z4.s, #0x4
 ; CHECK-SVE-AES-NEXT:    and z5.s, z5.s, #0x8
+; CHECK-SVE-AES-NEXT:    mov z25.d, z1.d
+; CHECK-SVE-AES-NEXT:    mov z26.d, z1.d
+; CHECK-SVE-AES-NEXT:    mov z27.d, z1.d
 ; CHECK-SVE-AES-NEXT:    and z6.s, z6.s, #0x10
 ; CHECK-SVE-AES-NEXT:    and z7.s, z7.s, #0x20
-; CHECK-SVE-AES-NEXT:    and z24.s, z24.s, #0x40
-; CHECK-SVE-AES-NEXT:    and z25.s, z25.s, #0x80
-; CHECK-SVE-AES-NEXT:    and z26.s, z26.s, #0x100
-; CHECK-SVE-AES-NEXT:    and z27.s, z27.s, #0x200
-; CHECK-SVE-AES-NEXT:    and z28.s, z28.s, #0x400
-; CHECK-SVE-AES-NEXT:    and z29.s, z29.s, #0x800
-; CHECK-SVE-AES-NEXT:    and z30.s, z30.s, #0x1000
-; CHECK-SVE-AES-NEXT:    and z31.s, z31.s, #0x2000
-; CHECK-SVE-AES-NEXT:    and z8.s, z8.s, #0x4000
-; CHECK-SVE-AES-NEXT:    and z1.s, z1.s, #0x8000
-; CHECK-SVE-AES-NEXT:    ptrue p0.s
 ; CHECK-SVE-AES-NEXT:    mul z2.s, p0/m, z2.s, z0.s
 ; CHECK-SVE-AES-NEXT:    mul z3.s, p0/m, z3.s, z0.s
+; CHECK-SVE-AES-NEXT:    and z24.s, z24.s, #0x80
 ; CHECK-SVE-AES-NEXT:    mul z4.s, p0/m, z4.s, z0.s
 ; CHECK-SVE-AES-NEXT:    mul z5.s, p0/m, z5.s, z0.s
+; CHECK-SVE-AES-NEXT:    and z25.s, z25.s, #0x100
 ; CHECK-SVE-AES-NEXT:    mul z6.s, p0/m, z6.s, z0.s
 ; CHECK-SVE-AES-NEXT:    mul z7.s, p0/m, z7.s, z0.s
+; CHECK-SVE-AES-NEXT:    mov z28.d, z1.d
 ; CHECK-SVE-AES-NEXT:    mul z24.s, p0/m, z24.s, z0.s
+; CHECK-SVE-AES-NEXT:    and z26.s, z26.s, #0x800
+; CHECK-SVE-AES-NEXT:    and z27.s, z27.s, #0x1000
 ; CHECK-SVE-AES-NEXT:    mul z25.s, p0/m, z25.s, z0.s
+; CHECK-SVE-AES-NEXT:    mov z29.d, z1.d
+; CHECK-SVE-AES-NEXT:    mov z30.d, z1.d
+; CHECK-SVE-AES-NEXT:    and z28.s, z28.s, #0x40
+; CHECK-SVE-AES-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE-AES-NEXT:    eor z3.d, z4.d, z5.d
 ; CHECK-SVE-AES-NEXT:    mul z26.s, p0/m, z26.s, z0.s
 ; CHECK-SVE-AES-NEXT:    mul z27.s, p0/m, z27.s, z0.s
-; CHECK-SVE-AES-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE-AES-NEXT:    and z29.s, z29.s, #0x200
+; CHECK-SVE-AES-NEXT:    and z30.s, z30.s, #0x2000
+; CHECK-SVE-AES-NEXT:    mov z5.d, z1.d
 ; CHECK-SVE-AES-NEXT:    mul z28.s, p0/m, z28.s, z0.s
+; CHECK-SVE-AES-NEXT:    eor z6.d, z6.d, z7.d
+; CHECK-SVE-AES-NEXT:    eor z7.d, z24.d, z25.d
+; CHECK-SVE-AES-NEXT:    and z4.s, z4.s, #0x400
+; CHECK-SVE-AES-NEXT:    and z1.s, z1.s, #0x8000
+; CHECK-SVE-AES-NEXT:    eor z2.d, z2.d, z3.d
 ; CHECK-SVE-AES-NEXT:    mul z29.s, p0/m, z29.s, z0.s
-; CHECK-SVE-AES-NEXT:    eor z3.d, z4.d, z5.d
 ; CHECK-SVE-AES-NEXT:    mul z30.s, p0/m, z30.s, z0.s
-; CHECK-SVE-AES-NEXT:    mul z31.s, p0/m, z31.s, z0.s
-; CHECK-SVE-AES-NEXT:    eor z4.d, z6.d, z7.d
-; CHECK-SVE-AES-NEXT:    mul z8.s, p0/m, z8.s, z0.s
-; CHECK-SVE-AES-NEXT:    mul z1.s, p0/m, z1.s, z0.s
-; CHECK-SVE-AES-NEXT:    mul z0.s, z0.s, #0
-; CHECK-SVE-AES-NEXT:    eor z5.d, z25.d, z26.d
-; CHECK-SVE-AES-NEXT:    eor z2.d, z2.d, z3.d
-; CHECK-SVE-AES-NEXT:    eor z3.d, z4.d, z24.d
-; CHECK-SVE-AES-NEXT:    eor z6.d, z29.d, z30.d
-; CHECK-SVE-AES-NEXT:    eor z4.d, z5.d, z27.d
-; CHECK-SVE-AES-NEXT:    eor z7.d, z0.d, z0.d
-; CHECK-SVE-AES-NEXT:    eor z2.d, z2.d, z3.d
-; CHECK-SVE-AES-NEXT:    eor z5.d, z6.d, z31.d
-; CHECK-SVE-AES-NEXT:    eor z3.d, z4.d, z28.d
-; CHECK-SVE-AES-NEXT:    eor z6.d, z7.d, z0.d
-; CHECK-SVE-AES-NEXT:    eor z4.d, z5.d, z8.d
-; CHECK-SVE-AES-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
-; CHECK-SVE-AES-NEXT:    eor z2.d, z2.d, z3.d
-; CHECK-SVE-AES-NEXT:    eor z5.d, z6.d, z0.d
-; CHECK-SVE-AES-NEXT:    eor z1.d, z4.d, z1.d
-; CHECK-SVE-AES-NEXT:    eor z3.d, z5.d, z0.d
-; CHECK-SVE-AES-NEXT:    eor z1.d, z2.d, z1.d
-; CHECK-SVE-AES-NEXT:    eor z2.d, z3.d, z0.d
+; CHECK-SVE-AES-NEXT:    and z5.s, z5.s, #0x4000
+; CHECK-SVE-AES-NEXT:    eor z24.d, z26.d, z27.d
+; CHECK-SVE-AES-NEXT:    mul z4.s, p0/m, z4.s, z0.s
+; CHECK-SVE-AES-NEXT:    eor z3.d, z6.d, z28.d
+; CHECK-SVE-AES-NEXT:    mul z5.s, p0/m, z5.s, z0.s
+; CHECK-SVE-AES-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-SVE-AES-NEXT:    eor z6.d, z7.d, z29.d
+; CHECK-SVE-AES-NEXT:    eor z7.d, z24.d, z30.d
+; CHECK-SVE-AES-NEXT:    eor z1.d, z2.d, z3.d
+; CHECK-SVE-AES-NEXT:    eor z2.d, z6.d, z4.d
+; CHECK-SVE-AES-NEXT:    eor z3.d, z7.d, z5.d
 ; CHECK-SVE-AES-NEXT:    eor z1.d, z1.d, z2.d
-; CHECK-SVE-AES-NEXT:    eor z0.d, z2.d, z0.d
+; CHECK-SVE-AES-NEXT:    eor z0.d, z3.d, z0.d
 ; CHECK-SVE-AES-NEXT:    eor z0.d, z1.d, z0.d
-; CHECK-SVE-AES-NEXT:    eor z0.d, z0.d, z6.d
-; CHECK-SVE-AES-NEXT:    addvl sp, sp, #1
-; CHECK-SVE-AES-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-SVE-AES-NEXT:    ret
 ;
 ; CHECK-SME-STREAMING-LABEL: clmul_nxv4i32_zext:
@@ -2069,221 +1983,134 @@ define <vscale x 4 x i32> @clmul_nxv4i32_zext(<vscale x 4 x i16> %x, <vscale x 4
 define <vscale x 2 x i64> @clmul_nxv2i64_zext(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y) {
 ; CHECK-SVE-LABEL: clmul_nxv2i64_zext:
 ; CHECK-SVE:       // %bb.0:
-; CHECK-SVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SVE-NEXT:    addvl sp, sp, #-16
-; CHECK-SVE-NEXT:    str z23, [sp] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    str z22, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    str z21, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    str z20, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    str z19, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    str z18, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    str z17, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    str z16, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    str z15, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    str z14, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    str z13, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    str z12, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    str z10, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    str z9, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    str z8, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    addvl sp, sp, #-3
-; CHECK-SVE-NEXT:    .cfi_escape 0x0f, 0x0a, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x98, 0x01, 0x1e, 0x22 // sp + 16 + 152 * VG
-; CHECK-SVE-NEXT:    .cfi_offset w29, -16
-; CHECK-SVE-NEXT:    .cfi_escape 0x10, 0x48, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x40, 0x1c // $d8 @ cfa - 8 * VG - 16
-; CHECK-SVE-NEXT:    .cfi_escape 0x10, 0x49, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x40, 0x1c // $d9 @ cfa - 16 * VG - 16
-; CHECK-SVE-NEXT:    .cfi_escape 0x10, 0x4a, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x40, 0x1c // $d10 @ cfa - 24 * VG - 16
-; CHECK-SVE-NEXT:    .cfi_escape 0x10, 0x4b, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x40, 0x1c // $d11 @ cfa - 32 * VG - 16
-; CHECK-SVE-NEXT:    .cfi_escape 0x10, 0x4c, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x40, 0x1c // $d12 @ cfa - 40 * VG - 16
-; CHECK-SVE-NEXT:    .cfi_escape 0x10, 0x4d, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x40, 0x1c // $d13 @ cfa - 48 * VG - 16
-; CHECK-SVE-NEXT:    .cfi_escape 0x10, 0x4e, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x40, 0x1c // $d14 @ cfa - 56 * VG - 16
-; CHECK-SVE-NEXT:    .cfi_escape 0x10, 0x4f, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x40, 0x1c // $d15 @ cfa - 64 * VG - 16
-; CHECK-SVE-NEXT:    mov z6.d, z1.d
-; CHECK-SVE-NEXT:    mov z29.d, z1.d
-; CHECK-SVE-NEXT:    and z0.d, z0.d, #0xffffffff
-; CHECK-SVE-NEXT:    ptrue p0.d
-; CHECK-SVE-NEXT:    mov z30.d, z1.d
-; CHECK-SVE-NEXT:    mov z10.d, z1.d
-; CHECK-SVE-NEXT:    mov z11.d, z1.d
-; CHECK-SVE-NEXT:    mov z12.d, z1.d
 ; CHECK-SVE-NEXT:    mov z2.d, z1.d
-; CHECK-SVE-NEXT:    and z6.d, z6.d, #0x10
-; CHECK-SVE-NEXT:    and z29.d, z29.d, #0x200
 ; CHECK-SVE-NEXT:    mov z3.d, z1.d
-; CHECK-SVE-NEXT:    and z30.d, z30.d, #0x1000
-; CHECK-SVE-NEXT:    and z10.d, z10.d, #0x4000
 ; CHECK-SVE-NEXT:    mov z4.d, z1.d
-; CHECK-SVE-NEXT:    and z11.d, z11.d, #0x8000
-; CHECK-SVE-NEXT:    and z12.d, z12.d, #0x10000
-; CHECK-SVE-NEXT:    and z2.d, z2.d, #0x2
-; CHECK-SVE-NEXT:    movprfx z24, z0
-; CHECK-SVE-NEXT:    mul z24.d, p0/m, z24.d, z6.d
-; CHECK-SVE-NEXT:    movprfx z6, z0
-; CHECK-SVE-NEXT:    mul z6.d, p0/m, z6.d, z29.d
-; CHECK-SVE-NEXT:    mov z29.d, z1.d
-; CHECK-SVE-NEXT:    movprfx z9, z0
-; CHECK-SVE-NEXT:    mul z9.d, p0/m, z9.d, z30.d
-; CHECK-SVE-NEXT:    movprfx z30, z0
-; CHECK-SVE-NEXT:    mul z30.d, p0/m, z30.d, z10.d
 ; CHECK-SVE-NEXT:    mov z5.d, z1.d
-; CHECK-SVE-NEXT:    movprfx z10, z0
-; CHECK-SVE-NEXT:    mul z10.d, p0/m, z10.d, z12.d
-; CHECK-SVE-NEXT:    mul z2.d, p0/m, z2.d, z0.d
-; CHECK-SVE-NEXT:    mov z26.d, z1.d
-; CHECK-SVE-NEXT:    and z29.d, z29.d, #0x800
-; CHECK-SVE-NEXT:    mov z27.d, z1.d
-; CHECK-SVE-NEXT:    mov z28.d, z1.d
-; CHECK-SVE-NEXT:    mov z12.d, z1.d
-; CHECK-SVE-NEXT:    mov z15.d, z1.d
-; CHECK-SVE-NEXT:    mov z16.d, z1.d
+; CHECK-SVE-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-SVE-NEXT:    ptrue p0.d
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    and z2.d, z2.d, #0x2
 ; CHECK-SVE-NEXT:    and z3.d, z3.d, #0x1
 ; CHECK-SVE-NEXT:    and z4.d, z4.d, #0x4
 ; CHECK-SVE-NEXT:    and z5.d, z5.d, #0x8
-; CHECK-SVE-NEXT:    movprfx z8, z0
-; CHECK-SVE-NEXT:    mul z8.d, p0/m, z8.d, z29.d
-; CHECK-SVE-NEXT:    movprfx z29, z0
-; CHECK-SVE-NEXT:    mul z29.d, p0/m, z29.d, z11.d
-; CHECK-SVE-NEXT:    str z2, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    mov z11.d, z1.d
-; CHECK-SVE-NEXT:    and z26.d, z26.d, #0x40
-; CHECK-SVE-NEXT:    and z27.d, z27.d, #0x80
+; CHECK-SVE-NEXT:    mov z25.d, z1.d
+; CHECK-SVE-NEXT:    mov z26.d, z1.d
+; CHECK-SVE-NEXT:    and z6.d, z6.d, #0x10
+; CHECK-SVE-NEXT:    and z7.d, z7.d, #0x20
+; CHECK-SVE-NEXT:    and z24.d, z24.d, #0x80
+; CHECK-SVE-NEXT:    mul z2.d, p0/m, z2.d, z0.d
 ; CHECK-SVE-NEXT:    mul z3.d, p0/m, z3.d, z0.d
+; CHECK-SVE-NEXT:    mov z27.d, z1.d
 ; CHECK-SVE-NEXT:    mul z4.d, p0/m, z4.d, z0.d
-; CHECK-SVE-NEXT:    movprfx z7, z0
-; CHECK-SVE-NEXT:    mul z7.d, p0/m, z7.d, z5.d
-; CHECK-SVE-NEXT:    and z28.d, z28.d, #0x100
-; CHECK-SVE-NEXT:    and z12.d, z12.d, #0x40000
-; CHECK-SVE-NEXT:    and z15.d, z15.d, #0x100000
-; CHECK-SVE-NEXT:    movprfx z5, z0
-; CHECK-SVE-NEXT:    mul z5.d, p0/m, z5.d, z26.d
-; CHECK-SVE-NEXT:    movprfx z26, z0
-; CHECK-SVE-NEXT:    mul z26.d, p0/m, z26.d, z27.d
-; CHECK-SVE-NEXT:    and z11.d, z11.d, #0x20000
-; CHECK-SVE-NEXT:    and z16.d, z16.d, #0x200000
-; CHECK-SVE-NEXT:    mov z25.d, z1.d
-; CHECK-SVE-NEXT:    mov z31.d, z1.d
-; CHECK-SVE-NEXT:    movprfx z27, z0
-; CHECK-SVE-NEXT:    mul z27.d, p0/m, z27.d, z28.d
-; CHECK-SVE-NEXT:    movprfx z14, z0
-; CHECK-SVE-NEXT:    mul z14.d, p0/m, z14.d, z12.d
-; CHECK-SVE-NEXT:    movprfx z12, z0
-; CHECK-SVE-NEXT:    mul z12.d, p0/m, z12.d, z15.d
-; CHECK-SVE-NEXT:    movprfx z19, z0
-; CHECK-SVE-NEXT:    mul z19.d, p0/m, z19.d, z11.d
-; CHECK-SVE-NEXT:    str z3, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    ldr z3, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    mul z5.d, p0/m, z5.d, z0.d
+; CHECK-SVE-NEXT:    and z25.d, z25.d, #0x100
+; CHECK-SVE-NEXT:    mul z6.d, p0/m, z6.d, z0.d
+; CHECK-SVE-NEXT:    mul z7.d, p0/m, z7.d, z0.d
+; CHECK-SVE-NEXT:    and z26.d, z26.d, #0x40
+; CHECK-SVE-NEXT:    mul z24.d, p0/m, z24.d, z0.d
+; CHECK-SVE-NEXT:    and z27.d, z27.d, #0x200
 ; CHECK-SVE-NEXT:    mov z28.d, z1.d
-; CHECK-SVE-NEXT:    mov z13.d, z1.d
-; CHECK-SVE-NEXT:    movprfx z11, z0
-; CHECK-SVE-NEXT:    mul z11.d, p0/m, z11.d, z16.d
-; CHECK-SVE-NEXT:    mov z17.d, z1.d
-; CHECK-SVE-NEXT:    mov z18.d, z1.d
-; CHECK-SVE-NEXT:    str z4, [sp] // 16-byte Folded Spill
-; CHECK-SVE-NEXT:    mov z15.d, z1.d
-; CHECK-SVE-NEXT:    mov z16.d, z1.d
-; CHECK-SVE-NEXT:    ldr z4, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    mov z20.d, z1.d
-; CHECK-SVE-NEXT:    mov z21.d, z1.d
-; CHECK-SVE-NEXT:    mov z22.d, z1.d
-; CHECK-SVE-NEXT:    mov z23.d, z1.d
-; CHECK-SVE-NEXT:    mov z2.d, z1.d
-; CHECK-SVE-NEXT:    and z25.d, z25.d, #0x20
-; CHECK-SVE-NEXT:    and z28.d, z28.d, #0x400
-; CHECK-SVE-NEXT:    and z31.d, z31.d, #0x2000
-; CHECK-SVE-NEXT:    and z13.d, z13.d, #0x80000
-; CHECK-SVE-NEXT:    and z17.d, z17.d, #0x400000
-; CHECK-SVE-NEXT:    and z18.d, z18.d, #0x800000
-; CHECK-SVE-NEXT:    and z15.d, z15.d, #0x1000000
 ; CHECK-SVE-NEXT:    mul z25.d, p0/m, z25.d, z0.d
-; CHECK-SVE-NEXT:    and z16.d, z16.d, #0x2000000
-; CHECK-SVE-NEXT:    and z20.d, z20.d, #0x4000000
+; CHECK-SVE-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE-NEXT:    mov z29.d, z1.d
+; CHECK-SVE-NEXT:    mul z26.d, p0/m, z26.d, z0.d
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z5.d
+; CHECK-SVE-NEXT:    mov z5.d, z1.d
+; CHECK-SVE-NEXT:    mul z27.d, p0/m, z27.d, z0.d
+; CHECK-SVE-NEXT:    eor z4.d, z6.d, z7.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    and z28.d, z28.d, #0x8000
+; CHECK-SVE-NEXT:    and z29.d, z29.d, #0x100000
+; CHECK-SVE-NEXT:    and z5.d, z5.d, #0x400
+; CHECK-SVE-NEXT:    eor z6.d, z24.d, z25.d
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z26.d
+; CHECK-SVE-NEXT:    and z7.d, z7.d, #0x800
 ; CHECK-SVE-NEXT:    mul z28.d, p0/m, z28.d, z0.d
-; CHECK-SVE-NEXT:    mul z31.d, p0/m, z31.d, z0.d
-; CHECK-SVE-NEXT:    and z21.d, z21.d, #0x8000000
-; CHECK-SVE-NEXT:    mul z13.d, p0/m, z13.d, z0.d
-; CHECK-SVE-NEXT:    mul z17.d, p0/m, z17.d, z0.d
-; CHECK-SVE-NEXT:    and z22.d, z22.d, #0x10000000
-; CHECK-SVE-NEXT:    mul z18.d, p0/m, z18.d, z0.d
-; CHECK-SVE-NEXT:    mul z15.d, p0/m, z15.d, z0.d
-; CHECK-SVE-NEXT:    and z23.d, z23.d, #0x20000000
-; CHECK-SVE-NEXT:    and z2.d, z2.d, #0x40000000
-; CHECK-SVE-NEXT:    mul z16.d, p0/m, z16.d, z0.d
-; CHECK-SVE-NEXT:    mul z20.d, p0/m, z20.d, z0.d
-; CHECK-SVE-NEXT:    and z1.d, z1.d, #0x80000000
-; CHECK-SVE-NEXT:    mul z21.d, p0/m, z21.d, z0.d
-; CHECK-SVE-NEXT:    mul z22.d, p0/m, z22.d, z0.d
-; CHECK-SVE-NEXT:    mul z23.d, p0/m, z23.d, z0.d
-; CHECK-SVE-NEXT:    eor z3.d, z4.d, z3.d
-; CHECK-SVE-NEXT:    ldr z4, [sp] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    mul z2.d, p0/m, z2.d, z0.d
-; CHECK-SVE-NEXT:    mul z1.d, p0/m, z1.d, z0.d
-; CHECK-SVE-NEXT:    mul z0.d, z0.d, #0
+; CHECK-SVE-NEXT:    mul z29.d, p0/m, z29.d, z0.d
+; CHECK-SVE-NEXT:    mov z25.d, z1.d
+; CHECK-SVE-NEXT:    mul z5.d, p0/m, z5.d, z0.d
+; CHECK-SVE-NEXT:    eor z4.d, z6.d, z27.d
+; CHECK-SVE-NEXT:    and z24.d, z24.d, #0x1000
+; CHECK-SVE-NEXT:    mul z7.d, p0/m, z7.d, z0.d
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    mov z26.d, z1.d
+; CHECK-SVE-NEXT:    and z25.d, z25.d, #0x800000
+; CHECK-SVE-NEXT:    mov z27.d, z1.d
+; CHECK-SVE-NEXT:    mul z24.d, p0/m, z24.d, z0.d
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z5.d
+; CHECK-SVE-NEXT:    and z6.d, z6.d, #0x2000
+; CHECK-SVE-NEXT:    mov z5.d, z1.d
+; CHECK-SVE-NEXT:    mul z25.d, p0/m, z25.d, z0.d
+; CHECK-SVE-NEXT:    and z26.d, z26.d, #0x40000
+; CHECK-SVE-NEXT:    and z27.d, z27.d, #0x1000000
+; CHECK-SVE-NEXT:    mul z6.d, p0/m, z6.d, z0.d
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    mov z3.d, z1.d
+; CHECK-SVE-NEXT:    mul z26.d, p0/m, z26.d, z0.d
+; CHECK-SVE-NEXT:    mul z27.d, p0/m, z27.d, z0.d
+; CHECK-SVE-NEXT:    and z5.d, z5.d, #0x20000
+; CHECK-SVE-NEXT:    eor z4.d, z7.d, z24.d
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    and z3.d, z3.d, #0x10000
+; CHECK-SVE-NEXT:    mul z5.d, p0/m, z5.d, z0.d
+; CHECK-SVE-NEXT:    and z24.d, z24.d, #0x400000
+; CHECK-SVE-NEXT:    eor z4.d, z4.d, z6.d
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    mul z3.d, p0/m, z3.d, z0.d
+; CHECK-SVE-NEXT:    and z7.d, z7.d, #0x4000
+; CHECK-SVE-NEXT:    mul z24.d, p0/m, z24.d, z0.d
+; CHECK-SVE-NEXT:    and z6.d, z6.d, #0x80000
+; CHECK-SVE-NEXT:    mul z7.d, p0/m, z7.d, z0.d
+; CHECK-SVE-NEXT:    eor z3.d, z3.d, z5.d
+; CHECK-SVE-NEXT:    mov z5.d, z1.d
+; CHECK-SVE-NEXT:    mul z6.d, p0/m, z6.d, z0.d
+; CHECK-SVE-NEXT:    eor z24.d, z24.d, z25.d
+; CHECK-SVE-NEXT:    mov z25.d, z1.d
+; CHECK-SVE-NEXT:    and z5.d, z5.d, #0x2000000
+; CHECK-SVE-NEXT:    eor z3.d, z3.d, z26.d
+; CHECK-SVE-NEXT:    mov z26.d, z1.d
 ; CHECK-SVE-NEXT:    eor z4.d, z4.d, z7.d
-; CHECK-SVE-NEXT:    eor z7.d, z24.d, z25.d
-; CHECK-SVE-NEXT:    eor z24.d, z26.d, z27.d
-; CHECK-SVE-NEXT:    eor z25.d, z8.d, z9.d
-; CHECK-SVE-NEXT:    eor z26.d, z10.d, z19.d
-; CHECK-SVE-NEXT:    eor z27.d, z17.d, z18.d
-; CHECK-SVE-NEXT:    eor z2.d, z23.d, z2.d
-; CHECK-SVE-NEXT:    eor z8.d, z0.d, z0.d
-; CHECK-SVE-NEXT:    eor z3.d, z3.d, z4.d
-; CHECK-SVE-NEXT:    eor z4.d, z7.d, z5.d
-; CHECK-SVE-NEXT:    eor z5.d, z24.d, z6.d
-; CHECK-SVE-NEXT:    eor z6.d, z25.d, z31.d
-; CHECK-SVE-NEXT:    eor z7.d, z26.d, z14.d
-; CHECK-SVE-NEXT:    eor z24.d, z27.d, z15.d
-; CHECK-SVE-NEXT:    eor z1.d, z2.d, z1.d
-; CHECK-SVE-NEXT:    eor z2.d, z8.d, z0.d
-; CHECK-SVE-NEXT:    eor z3.d, z3.d, z4.d
-; CHECK-SVE-NEXT:    eor z4.d, z5.d, z28.d
-; CHECK-SVE-NEXT:    eor z5.d, z6.d, z30.d
-; CHECK-SVE-NEXT:    eor z6.d, z7.d, z13.d
-; CHECK-SVE-NEXT:    eor z7.d, z24.d, z16.d
-; CHECK-SVE-NEXT:    eor z1.d, z1.d, z0.d
-; CHECK-SVE-NEXT:    eor z2.d, z2.d, z0.d
-; CHECK-SVE-NEXT:    eor z3.d, z3.d, z4.d
-; CHECK-SVE-NEXT:    eor z4.d, z5.d, z29.d
-; CHECK-SVE-NEXT:    eor z5.d, z6.d, z12.d
-; CHECK-SVE-NEXT:    eor z6.d, z7.d, z20.d
-; CHECK-SVE-NEXT:    eor z1.d, z1.d, z0.d
-; CHECK-SVE-NEXT:    eor z2.d, z2.d, z0.d
-; CHECK-SVE-NEXT:    eor z3.d, z3.d, z4.d
-; CHECK-SVE-NEXT:    eor z4.d, z5.d, z11.d
-; CHECK-SVE-NEXT:    eor z5.d, z6.d, z21.d
-; CHECK-SVE-NEXT:    eor z1.d, z1.d, z0.d
-; CHECK-SVE-NEXT:    eor z2.d, z2.d, z0.d
-; CHECK-SVE-NEXT:    eor z3.d, z3.d, z4.d
-; CHECK-SVE-NEXT:    eor z4.d, z5.d, z22.d
-; CHECK-SVE-NEXT:    eor z1.d, z1.d, z0.d
-; CHECK-SVE-NEXT:    eor z2.d, z2.d, z0.d
-; CHECK-SVE-NEXT:    eor z3.d, z3.d, z4.d
-; CHECK-SVE-NEXT:    eor z1.d, z1.d, z0.d
-; CHECK-SVE-NEXT:    eor z2.d, z2.d, z0.d
-; CHECK-SVE-NEXT:    eor z1.d, z3.d, z1.d
-; CHECK-SVE-NEXT:    eor z3.d, z2.d, z0.d
-; CHECK-SVE-NEXT:    eor z1.d, z1.d, z3.d
+; CHECK-SVE-NEXT:    and z25.d, z25.d, #0x4000000
+; CHECK-SVE-NEXT:    eor z7.d, z24.d, z27.d
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    mul z5.d, p0/m, z5.d, z0.d
+; CHECK-SVE-NEXT:    eor z3.d, z3.d, z6.d
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    and z26.d, z26.d, #0x200000
+; CHECK-SVE-NEXT:    mov z27.d, z1.d
+; CHECK-SVE-NEXT:    eor z4.d, z4.d, z28.d
+; CHECK-SVE-NEXT:    mul z25.d, p0/m, z25.d, z0.d
+; CHECK-SVE-NEXT:    and z24.d, z24.d, #0x20000000
+; CHECK-SVE-NEXT:    and z6.d, z6.d, #0x8000000
+; CHECK-SVE-NEXT:    eor z3.d, z3.d, z29.d
+; CHECK-SVE-NEXT:    eor z5.d, z7.d, z5.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    mul z26.d, p0/m, z26.d, z0.d
+; CHECK-SVE-NEXT:    and z27.d, z27.d, #0x40000000
+; CHECK-SVE-NEXT:    mul z24.d, p0/m, z24.d, z0.d
+; CHECK-SVE-NEXT:    and z1.d, z1.d, #0x80000000
+; CHECK-SVE-NEXT:    mul z6.d, p0/m, z6.d, z0.d
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z4.d
+; CHECK-SVE-NEXT:    and z7.d, z7.d, #0x10000000
+; CHECK-SVE-NEXT:    eor z5.d, z5.d, z25.d
+; CHECK-SVE-NEXT:    mul z27.d, p0/m, z27.d, z0.d
+; CHECK-SVE-NEXT:    eor z3.d, z3.d, z26.d
+; CHECK-SVE-NEXT:    mul z7.d, p0/m, z7.d, z0.d
+; CHECK-SVE-NEXT:    eor z4.d, z5.d, z6.d
+; CHECK-SVE-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT:    eor z1.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    eor z3.d, z24.d, z27.d
+; CHECK-SVE-NEXT:    eor z2.d, z4.d, z7.d
 ; CHECK-SVE-NEXT:    eor z0.d, z3.d, z0.d
+; CHECK-SVE-NEXT:    eor z1.d, z1.d, z2.d
 ; CHECK-SVE-NEXT:    eor z0.d, z1.d, z0.d
-; CHECK-SVE-NEXT:    eor z0.d, z0.d, z2.d
-; CHECK-SVE-NEXT:    addvl sp, sp, #3
-; CHECK-SVE-NEXT:    ldr z23, [sp] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-SVE-NEXT:    addvl sp, sp, #16
-; CHECK-SVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-SVE-NEXT:    ret
 ;
 ; CHECK-SVE-AES-LABEL: clmul_nxv2i64_zext:
@@ -2405,27 +2232,10 @@ define <vscale x 2 x i64> @clmul_nxv2i64_zext(<vscale x 2 x i32> %x, <vscale x 2
 ; CHECK-SME-STREAMING-NEXT:    mul z3.d, z0.d, z3.d
 ; CHECK-SME-STREAMING-NEXT:    mul z4.d, z0.d, z6.d
 ; CHECK-SME-STREAMING-NEXT:    and z5.d, z5.d, #0x80000000
-; CHECK-SME-STREAMING-NEXT:    mul z1.d, z0.d, z1.d
 ; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z4.d, z3.d
 ; CHECK-SME-STREAMING-NEXT:    mul z3.d, z0.d, z5.d
-; CHECK-SME-STREAMING-NEXT:    mul z0.d, z0.d, #0
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z1.d, z3.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SME-STREAMING-NEXT:    mul z0.d, z0.d, z1.d
+; CHECK-SME-STREAMING-NEXT:    eor3 z2.d, z2.d, z0.d, z3.d
 ; CHECK-SME-STREAMING-NEXT:    mov z0.d, z2.d
 ; CHECK-SME-STREAMING-NEXT:    ret
 ;
@@ -2548,27 +2358,10 @@ define <vscale x 2 x i64> @clmul_nxv2i64_zext(<vscale x 2 x i32> %x, <vscale x 2
 ; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
 ; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z6.d
 ; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x80000000
-; CHECK-SVE2-NEXT:    mul z1.d, z0.d, z1.d
 ; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z4.d, z3.d
 ; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z5.d
-; CHECK-SVE2-NEXT:    mul z0.d, z0.d, #0
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z1.d, z3.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
-; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    mul z0.d, z0.d, z1.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z3.d
 ; CHECK-SVE2-NEXT:    mov z0.d, z2.d
 ; CHECK-SVE2-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/PowerPC/clmul-vector.ll b/llvm/test/CodeGen/PowerPC/clmul-vector.ll
index 9089dca5b0ed7..f57dbeade4805 100644
--- a/llvm/test/CodeGen/PowerPC/clmul-vector.ll
+++ b/llvm/test/CodeGen/PowerPC/clmul-vector.ll
@@ -2568,288 +2568,291 @@ define <8 x i16> @clmulr_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; BE-LABEL: clmulr_v8i16:
 ; BE:       # %bb.0:
 ; BE-NEXT:    li 3, -80
-; BE-NEXT:    vspltish 4, 8
-; BE-NEXT:    vxor 5, 5, 5
+; BE-NEXT:    vspltisb 5, -1
 ; BE-NEXT:    stvx 27, 1, 3 # 16-byte Folded Spill
 ; BE-NEXT:    li 3, -64
-; BE-NEXT:    vadduhm 19, 4, 4
-; BE-NEXT:    vspltisb 1, -1
+; BE-NEXT:    vslh 16, 5, 5
+; BE-NEXT:    vspltish 4, 4
 ; BE-NEXT:    stvx 28, 1, 3 # 16-byte Folded Spill
 ; BE-NEXT:    li 3, -48
-; BE-NEXT:    vspltish 0, 2
+; BE-NEXT:    vspltish 8, 1
 ; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
 ; BE-NEXT:    li 3, -32
-; BE-NEXT:    vrlh 8, 2, 4
-; BE-NEXT:    vspltish 2, 4
+; BE-NEXT:    vspltish 14, 2
+; BE-NEXT:    vslh 0, 4, 4
 ; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
 ; BE-NEXT:    li 3, -16
+; BE-NEXT:    vspltish 15, 8
 ; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
 ; BE-NEXT:    addis 3, 2, .LCPI5_0@toc@ha
 ; BE-NEXT:    addi 3, 3, .LCPI5_0@toc@l
-; BE-NEXT:    vrlh 6, 3, 4
-; BE-NEXT:    vspltish 3, 1
-; BE-NEXT:    vslh 13, 1, 1
-; BE-NEXT:    vspltisb 1, 15
-; BE-NEXT:    vand 14, 8, 1
-; BE-NEXT:    vsrh 8, 8, 2
-; BE-NEXT:    vand 15, 6, 1
-; BE-NEXT:    vsrh 6, 6, 2
-; BE-NEXT:    vslh 14, 14, 2
-; BE-NEXT:    vand 8, 8, 1
-; BE-NEXT:    vslh 15, 15, 2
-; BE-NEXT:    vand 6, 6, 1
-; BE-NEXT:    vor 8, 8, 14
-; BE-NEXT:    vor 14, 6, 15
-; BE-NEXT:    lvx 6, 0, 3
+; BE-NEXT:    lvx 5, 0, 3
 ; BE-NEXT:    addis 3, 2, .LCPI5_1@toc@ha
 ; BE-NEXT:    addi 3, 3, .LCPI5_1@toc@l
-; BE-NEXT:    vand 15, 8, 6
-; BE-NEXT:    vsrh 8, 8, 0
-; BE-NEXT:    vslh 15, 15, 0
-; BE-NEXT:    vand 8, 8, 6
-; BE-NEXT:    vor 15, 8, 15
-; BE-NEXT:    lvx 8, 0, 3
+; BE-NEXT:    vsldoi 7, 8, 8, 1
+; BE-NEXT:    vperm 1, 2, 2, 5
+; BE-NEXT:    vspltisb 2, 4
+; BE-NEXT:    vperm 6, 3, 3, 5
+; BE-NEXT:    vspltisb 3, 15
+; BE-NEXT:    vsrb 10, 1, 2
+; BE-NEXT:    vand 1, 1, 3
+; BE-NEXT:    vslb 1, 1, 2
+; BE-NEXT:    vsrb 12, 6, 2
+; BE-NEXT:    vand 6, 6, 3
+; BE-NEXT:    vor 10, 10, 1
+; BE-NEXT:    lvx 1, 0, 3
 ; BE-NEXT:    addis 3, 2, .LCPI5_2@toc@ha
 ; BE-NEXT:    addi 3, 3, .LCPI5_2@toc@l
-; BE-NEXT:    lvx 31, 0, 3
+; BE-NEXT:    vslb 6, 6, 2
+; BE-NEXT:    vor 12, 12, 6
+; BE-NEXT:    vspltisb 6, 2
+; BE-NEXT:    vand 18, 12, 1
+; BE-NEXT:    vsrb 12, 12, 6
+; BE-NEXT:    vslb 18, 18, 6
+; BE-NEXT:    vand 12, 12, 1
+; BE-NEXT:    vand 17, 10, 1
+; BE-NEXT:    vsrb 10, 10, 6
+; BE-NEXT:    vor 18, 12, 18
+; BE-NEXT:    lvx 12, 0, 3
 ; BE-NEXT:    addis 3, 2, .LCPI5_3@toc@ha
 ; BE-NEXT:    addi 3, 3, .LCPI5_3@toc@l
-; BE-NEXT:    lvx 30, 0, 3
+; BE-NEXT:    vslb 17, 17, 6
+; BE-NEXT:    vand 10, 10, 1
+; BE-NEXT:    vor 17, 10, 17
+; BE-NEXT:    vspltisb 10, 1
+; BE-NEXT:    vsrb 31, 18, 10
+; BE-NEXT:    vand 18, 18, 12
+; BE-NEXT:    vaddubm 18, 18, 18
+; BE-NEXT:    vand 31, 31, 12
+; BE-NEXT:    vor 18, 31, 18
+; BE-NEXT:    lvx 31, 0, 3
 ; BE-NEXT:    addis 3, 2, .LCPI5_4@toc@ha
 ; BE-NEXT:    addi 3, 3, .LCPI5_4@toc@l
-; BE-NEXT:    vand 16, 14, 6
-; BE-NEXT:    lvx 29, 0, 3
+; BE-NEXT:    lvx 30, 0, 3
 ; BE-NEXT:    addis 3, 2, .LCPI5_5@toc@ha
 ; BE-NEXT:    addi 3, 3, .LCPI5_5@toc@l
-; BE-NEXT:    lvx 28, 0, 3
+; BE-NEXT:    lvx 29, 0, 3
 ; BE-NEXT:    addis 3, 2, .LCPI5_6@toc@ha
 ; BE-NEXT:    addi 3, 3, .LCPI5_6@toc@l
+; BE-NEXT:    vsrb 19, 17, 10
+; BE-NEXT:    lvx 28, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI5_7@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI5_7@toc@l
 ; BE-NEXT:    lvx 27, 0, 3
 ; BE-NEXT:    li 3, -16
-; BE-NEXT:    vsrh 14, 14, 0
-; BE-NEXT:    vslh 16, 16, 0
-; BE-NEXT:    vand 14, 14, 6
-; BE-NEXT:    vor 14, 14, 16
-; BE-NEXT:    vsrh 17, 14, 3
-; BE-NEXT:    vand 14, 14, 8
-; BE-NEXT:    vadduhm 14, 14, 14
-; BE-NEXT:    vsrh 16, 15, 3
-; BE-NEXT:    vand 15, 15, 8
+; BE-NEXT:    vand 17, 17, 12
+; BE-NEXT:    vaddubm 17, 17, 17
+; BE-NEXT:    vand 19, 19, 12
+; BE-NEXT:    vsldoi 9, 14, 14, 1
+; BE-NEXT:    vsldoi 11, 4, 4, 1
+; BE-NEXT:    vslh 13, 15, 15
+; BE-NEXT:    vor 17, 19, 17
+; BE-NEXT:    vand 19, 18, 15
 ; BE-NEXT:    vadduhm 15, 15, 15
-; BE-NEXT:    vand 17, 17, 8
-; BE-NEXT:    vand 16, 16, 8
-; BE-NEXT:    vor 14, 17, 14
-; BE-NEXT:    vslh 7, 2, 2
-; BE-NEXT:    vsldoi 9, 3, 3, 1
-; BE-NEXT:    vsldoi 10, 0, 0, 1
-; BE-NEXT:    vsldoi 11, 2, 2, 1
-; BE-NEXT:    vslh 12, 4, 4
-; BE-NEXT:    vor 15, 16, 15
-; BE-NEXT:    vand 16, 14, 0
-; BE-NEXT:    vand 17, 14, 3
-; BE-NEXT:    vand 18, 14, 2
-; BE-NEXT:    vand 19, 14, 19
-; BE-NEXT:    vand 31, 14, 31
-; BE-NEXT:    vand 7, 14, 7
-; BE-NEXT:    vand 30, 14, 30
-; BE-NEXT:    vand 9, 14, 9
-; BE-NEXT:    vand 10, 14, 10
-; BE-NEXT:    vand 11, 14, 11
-; BE-NEXT:    vand 12, 14, 12
-; BE-NEXT:    vand 29, 14, 29
-; BE-NEXT:    vand 28, 14, 28
-; BE-NEXT:    vand 27, 14, 27
-; BE-NEXT:    vand 13, 14, 13
-; BE-NEXT:    vand 14, 14, 4
-; BE-NEXT:    vmladduhm 16, 15, 16, 5
-; BE-NEXT:    vmladduhm 17, 15, 17, 5
-; BE-NEXT:    vmladduhm 18, 15, 18, 5
-; BE-NEXT:    vmladduhm 14, 15, 14, 5
-; BE-NEXT:    vmladduhm 19, 15, 19, 5
-; BE-NEXT:    vmladduhm 31, 15, 31, 5
-; BE-NEXT:    vmladduhm 7, 15, 7, 5
-; BE-NEXT:    vmladduhm 30, 15, 30, 5
-; BE-NEXT:    vmladduhm 9, 15, 9, 5
-; BE-NEXT:    vmladduhm 10, 15, 10, 5
-; BE-NEXT:    vmladduhm 11, 15, 11, 5
-; BE-NEXT:    vmladduhm 12, 15, 12, 5
-; BE-NEXT:    vmladduhm 29, 15, 29, 5
-; BE-NEXT:    vmladduhm 28, 15, 28, 5
-; BE-NEXT:    vmladduhm 27, 15, 27, 5
-; BE-NEXT:    vmladduhm 5, 15, 13, 5
-; BE-NEXT:    vxor 13, 17, 16
-; BE-NEXT:    vxor 13, 13, 18
-; BE-NEXT:    vxor 13, 13, 14
-; BE-NEXT:    vxor 13, 13, 19
-; BE-NEXT:    vxor 13, 13, 31
+; BE-NEXT:    vand 14, 18, 14
+; BE-NEXT:    vand 8, 18, 8
+; BE-NEXT:    vand 4, 18, 4
+; BE-NEXT:    vand 15, 18, 15
+; BE-NEXT:    vand 31, 18, 31
+; BE-NEXT:    vand 0, 18, 0
+; BE-NEXT:    vand 30, 18, 30
+; BE-NEXT:    vand 7, 18, 7
+; BE-NEXT:    vand 9, 18, 9
+; BE-NEXT:    vand 11, 18, 11
+; BE-NEXT:    vand 13, 18, 13
+; BE-NEXT:    vand 29, 18, 29
+; BE-NEXT:    vand 28, 18, 28
+; BE-NEXT:    vand 27, 18, 27
+; BE-NEXT:    vand 16, 18, 16
+; BE-NEXT:    vxor 18, 18, 18
+; BE-NEXT:    vmladduhm 14, 17, 14, 18
+; BE-NEXT:    vmladduhm 8, 17, 8, 18
+; BE-NEXT:    vmladduhm 4, 17, 4, 18
+; BE-NEXT:    vxor 8, 8, 14
+; BE-NEXT:    vmladduhm 19, 17, 19, 18
+; BE-NEXT:    vxor 4, 8, 4
+; BE-NEXT:    vmladduhm 15, 17, 15, 18
+; BE-NEXT:    vxor 4, 4, 19
+; BE-NEXT:    vmladduhm 31, 17, 31, 18
+; BE-NEXT:    vxor 4, 4, 15
+; BE-NEXT:    vmladduhm 0, 17, 0, 18
+; BE-NEXT:    vxor 4, 4, 31
 ; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
 ; BE-NEXT:    li 3, -32
-; BE-NEXT:    vxor 7, 13, 7
-; BE-NEXT:    vxor 7, 7, 30
+; BE-NEXT:    vmladduhm 30, 17, 30, 18
+; BE-NEXT:    vxor 4, 4, 0
+; BE-NEXT:    vmladduhm 7, 17, 7, 18
+; BE-NEXT:    vxor 4, 4, 30
 ; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
 ; BE-NEXT:    li 3, -48
-; BE-NEXT:    vxor 7, 7, 9
-; BE-NEXT:    vxor 7, 7, 10
-; BE-NEXT:    vxor 7, 7, 11
-; BE-NEXT:    vxor 7, 7, 12
-; BE-NEXT:    vxor 7, 7, 29
+; BE-NEXT:    vmladduhm 9, 17, 9, 18
+; BE-NEXT:    vxor 4, 4, 7
+; BE-NEXT:    vmladduhm 11, 17, 11, 18
+; BE-NEXT:    vxor 4, 4, 9
+; BE-NEXT:    vmladduhm 13, 17, 13, 18
+; BE-NEXT:    vxor 4, 4, 11
+; BE-NEXT:    vmladduhm 29, 17, 29, 18
+; BE-NEXT:    vxor 4, 4, 13
+; BE-NEXT:    vmladduhm 28, 17, 28, 18
+; BE-NEXT:    vxor 4, 4, 29
 ; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
 ; BE-NEXT:    li 3, -64
-; BE-NEXT:    vxor 7, 7, 28
+; BE-NEXT:    vmladduhm 27, 17, 27, 18
+; BE-NEXT:    vxor 4, 4, 28
 ; BE-NEXT:    lvx 28, 1, 3 # 16-byte Folded Reload
 ; BE-NEXT:    li 3, -80
-; BE-NEXT:    vxor 7, 7, 27
+; BE-NEXT:    vmladduhm 16, 17, 16, 18
+; BE-NEXT:    vxor 4, 4, 27
 ; BE-NEXT:    lvx 27, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    vxor 5, 7, 5
-; BE-NEXT:    vrlh 4, 5, 4
-; BE-NEXT:    vand 5, 4, 1
-; BE-NEXT:    vsrh 4, 4, 2
-; BE-NEXT:    vslh 2, 5, 2
-; BE-NEXT:    vand 4, 4, 1
-; BE-NEXT:    vor 2, 4, 2
-; BE-NEXT:    vand 4, 2, 6
-; BE-NEXT:    vsrh 2, 2, 0
-; BE-NEXT:    vslh 4, 4, 0
-; BE-NEXT:    vand 2, 2, 6
-; BE-NEXT:    vor 2, 2, 4
-; BE-NEXT:    vsrh 3, 2, 3
-; BE-NEXT:    vand 2, 2, 8
-; BE-NEXT:    vadduhm 2, 2, 2
-; BE-NEXT:    vand 3, 3, 8
+; BE-NEXT:    vxor 4, 4, 16
+; BE-NEXT:    vperm 4, 4, 4, 5
+; BE-NEXT:    vand 3, 4, 3
+; BE-NEXT:    vsrb 5, 4, 2
+; BE-NEXT:    vslb 2, 3, 2
+; BE-NEXT:    vor 2, 5, 2
+; BE-NEXT:    vand 3, 2, 1
+; BE-NEXT:    vsrb 2, 2, 6
+; BE-NEXT:    vslb 3, 3, 6
+; BE-NEXT:    vand 2, 2, 1
+; BE-NEXT:    vor 2, 2, 3
+; BE-NEXT:    vsrb 3, 2, 10
+; BE-NEXT:    vand 2, 2, 12
+; BE-NEXT:    vaddubm 2, 2, 2
+; BE-NEXT:    vand 3, 3, 12
 ; BE-NEXT:    vor 2, 3, 2
 ; BE-NEXT:    blr
 ;
 ; LE-LABEL: clmulr_v8i16:
 ; LE:       # %bb.0:
-; LE-NEXT:    vspltish 5, 8
-; LE-NEXT:    vspltisb 4, 15
 ; LE-NEXT:    addis 3, 2, .LCPI5_0@toc@ha
-; LE-NEXT:    vrlh 2, 2, 5
-; LE-NEXT:    vspltish 0, 4
+; LE-NEXT:    vspltisb 5, 4
+; LE-NEXT:    vspltish 7, 2
 ; LE-NEXT:    addi 3, 3, .LCPI5_0@toc@l
-; LE-NEXT:    vspltish 1, 2
-; LE-NEXT:    vspltish 6, 1
-; LE-NEXT:    vrlh 3, 3, 5
-; LE-NEXT:    xxland 42, 34, 36
-; LE-NEXT:    vsrh 2, 2, 0
-; LE-NEXT:    vslh 10, 10, 0
-; LE-NEXT:    xxland 0, 34, 36
-; LE-NEXT:    vsldoi 7, 6, 6, 1
-; LE-NEXT:    vsldoi 8, 1, 1, 1
-; LE-NEXT:    vsldoi 9, 0, 0, 1
-; LE-NEXT:    xxlor 34, 0, 42
+; LE-NEXT:    vspltish 8, 1
+; LE-NEXT:    vspltish 1, 4
+; LE-NEXT:    vspltish 0, 8
 ; LE-NEXT:    lxvd2x 0, 0, 3
 ; LE-NEXT:    addis 3, 2, .LCPI5_1@toc@ha
 ; LE-NEXT:    addi 3, 3, .LCPI5_1@toc@l
-; LE-NEXT:    xxland 42, 34, 0
-; LE-NEXT:    vsrh 2, 2, 1
-; LE-NEXT:    vslh 10, 10, 1
-; LE-NEXT:    xxland 1, 34, 0
-; LE-NEXT:    xxlor 34, 1, 42
-; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    vsldoi 9, 8, 8, 1
+; LE-NEXT:    vsldoi 13, 1, 1, 1
+; LE-NEXT:    xxswapd 36, 0
+; LE-NEXT:    lxvd2x 0, 0, 3
 ; LE-NEXT:    addis 3, 2, .LCPI5_2@toc@ha
-; LE-NEXT:    vsrh 10, 2, 6
 ; LE-NEXT:    addi 3, 3, .LCPI5_2@toc@l
-; LE-NEXT:    lxvd2x 4, 0, 3
+; LE-NEXT:    vperm 10, 2, 2, 4
+; LE-NEXT:    vperm 6, 3, 3, 4
+; LE-NEXT:    vspltisb 3, 2
+; LE-NEXT:    vspltisb 2, 1
+; LE-NEXT:    vslb 11, 10, 5
+; LE-NEXT:    vsrb 12, 10, 5
+; LE-NEXT:    xxlor 43, 44, 43
+; LE-NEXT:    xxland 44, 43, 0
+; LE-NEXT:    vsrb 11, 11, 3
+; LE-NEXT:    vsldoi 10, 7, 7, 1
+; LE-NEXT:    vslb 12, 12, 3
+; LE-NEXT:    xxland 1, 43, 0
+; LE-NEXT:    xxlor 43, 1, 44
+; LE-NEXT:    lxvd2x 1, 0, 3
 ; LE-NEXT:    addis 3, 2, .LCPI5_3@toc@ha
-; LE-NEXT:    xxland 34, 34, 1
-; LE-NEXT:    xxland 2, 42, 1
-; LE-NEXT:    xxland 42, 35, 36
-; LE-NEXT:    vsrh 3, 3, 0
+; LE-NEXT:    vsrb 12, 11, 2
 ; LE-NEXT:    addi 3, 3, .LCPI5_3@toc@l
-; LE-NEXT:    vadduhm 2, 2, 2
-; LE-NEXT:    vslh 10, 10, 0
-; LE-NEXT:    xxlor 34, 2, 34
-; LE-NEXT:    xxland 2, 35, 36
-; LE-NEXT:    xxlor 35, 2, 42
-; LE-NEXT:    xxland 42, 35, 0
-; LE-NEXT:    vsrh 3, 3, 1
-; LE-NEXT:    vslh 10, 10, 1
-; LE-NEXT:    xxland 2, 35, 0
-; LE-NEXT:    xxlor 35, 2, 42
-; LE-NEXT:    vsrh 10, 3, 6
-; LE-NEXT:    xxland 35, 35, 1
-; LE-NEXT:    xxland 2, 42, 1
-; LE-NEXT:    vadduhm 3, 3, 3
-; LE-NEXT:    xxlor 2, 2, 35
-; LE-NEXT:    vxor 3, 3, 3
-; LE-NEXT:    xxland 42, 2, 33
-; LE-NEXT:    xxland 43, 2, 38
-; LE-NEXT:    xxland 39, 2, 39
-; LE-NEXT:    vmladduhm 10, 2, 10, 3
-; LE-NEXT:    vmladduhm 11, 2, 11, 3
-; LE-NEXT:    vmladduhm 7, 2, 7, 3
-; LE-NEXT:    xxlxor 3, 43, 42
-; LE-NEXT:    xxland 42, 2, 32
-; LE-NEXT:    vmladduhm 10, 2, 10, 3
-; LE-NEXT:    xxlxor 3, 3, 42
-; LE-NEXT:    xxland 42, 2, 37
-; LE-NEXT:    vmladduhm 10, 2, 10, 3
-; LE-NEXT:    xxlxor 3, 3, 42
-; LE-NEXT:    vadduhm 10, 5, 5
-; LE-NEXT:    xxland 42, 2, 42
-; LE-NEXT:    vmladduhm 10, 2, 10, 3
-; LE-NEXT:    xxlxor 3, 3, 42
-; LE-NEXT:    xxland 42, 2, 4
 ; LE-NEXT:    lxvd2x 4, 0, 3
 ; LE-NEXT:    addis 3, 2, .LCPI5_4@toc@ha
-; LE-NEXT:    vmladduhm 10, 2, 10, 3
+; LE-NEXT:    xxland 2, 44, 1
+; LE-NEXT:    vslb 12, 6, 5
+; LE-NEXT:    vsrb 6, 6, 5
 ; LE-NEXT:    addi 3, 3, .LCPI5_4@toc@l
-; LE-NEXT:    xxlxor 3, 3, 42
-; LE-NEXT:    vslh 10, 0, 0
-; LE-NEXT:    xxland 42, 2, 42
-; LE-NEXT:    vmladduhm 10, 2, 10, 3
-; LE-NEXT:    xxlxor 3, 3, 42
-; LE-NEXT:    xxland 42, 2, 4
+; LE-NEXT:    xxlor 38, 38, 44
+; LE-NEXT:    xxland 44, 38, 0
+; LE-NEXT:    vsrb 6, 6, 3
+; LE-NEXT:    vslb 12, 12, 3
+; LE-NEXT:    xxland 3, 38, 0
+; LE-NEXT:    xxlor 44, 3, 44
+; LE-NEXT:    vsrb 6, 12, 2
+; LE-NEXT:    xxland 3, 38, 1
+; LE-NEXT:    xxland 38, 43, 1
+; LE-NEXT:    xxland 43, 44, 1
+; LE-NEXT:    vaddubm 6, 6, 6
+; LE-NEXT:    vaddubm 11, 11, 11
+; LE-NEXT:    xxlor 38, 2, 38
+; LE-NEXT:    xxlor 2, 3, 43
+; LE-NEXT:    xxland 43, 2, 39
+; LE-NEXT:    xxland 40, 2, 40
+; LE-NEXT:    vxor 7, 7, 7
+; LE-NEXT:    vmladduhm 11, 6, 11, 7
+; LE-NEXT:    vmladduhm 8, 6, 8, 7
+; LE-NEXT:    xxlxor 3, 40, 43
+; LE-NEXT:    xxland 40, 2, 33
+; LE-NEXT:    vslh 1, 1, 1
+; LE-NEXT:    vmladduhm 8, 6, 8, 7
+; LE-NEXT:    xxland 33, 2, 33
+; LE-NEXT:    vmladduhm 1, 6, 1, 7
+; LE-NEXT:    xxlxor 3, 3, 40
+; LE-NEXT:    xxland 40, 2, 32
+; LE-NEXT:    vmladduhm 8, 6, 8, 7
+; LE-NEXT:    xxlxor 3, 3, 40
+; LE-NEXT:    vadduhm 8, 0, 0
+; LE-NEXT:    vslh 0, 0, 0
+; LE-NEXT:    xxland 40, 2, 40
+; LE-NEXT:    xxland 32, 2, 32
+; LE-NEXT:    vmladduhm 8, 6, 8, 7
+; LE-NEXT:    vmladduhm 0, 6, 0, 7
+; LE-NEXT:    xxlxor 3, 3, 40
+; LE-NEXT:    xxland 40, 2, 4
 ; LE-NEXT:    lxvd2x 4, 0, 3
 ; LE-NEXT:    addis 3, 2, .LCPI5_5@toc@ha
-; LE-NEXT:    vmladduhm 10, 2, 10, 3
+; LE-NEXT:    vmladduhm 8, 6, 8, 7
 ; LE-NEXT:    addi 3, 3, .LCPI5_5@toc@l
-; LE-NEXT:    xxlxor 3, 3, 42
-; LE-NEXT:    xxlxor 3, 3, 39
-; LE-NEXT:    xxland 39, 2, 40
-; LE-NEXT:    vmladduhm 7, 2, 7, 3
-; LE-NEXT:    xxlxor 3, 3, 39
-; LE-NEXT:    xxland 39, 2, 41
-; LE-NEXT:    vmladduhm 7, 2, 7, 3
-; LE-NEXT:    xxlxor 3, 3, 39
-; LE-NEXT:    vslh 7, 5, 5
-; LE-NEXT:    xxland 39, 2, 39
-; LE-NEXT:    vmladduhm 7, 2, 7, 3
-; LE-NEXT:    xxlxor 3, 3, 39
-; LE-NEXT:    xxland 39, 2, 4
+; LE-NEXT:    xxlxor 3, 3, 40
+; LE-NEXT:    xxlxor 3, 3, 33
+; LE-NEXT:    xxland 33, 2, 4
 ; LE-NEXT:    lxvd2x 4, 0, 3
 ; LE-NEXT:    addis 3, 2, .LCPI5_6@toc@ha
-; LE-NEXT:    vmladduhm 7, 2, 7, 3
+; LE-NEXT:    vmladduhm 1, 6, 1, 7
 ; LE-NEXT:    addi 3, 3, .LCPI5_6@toc@l
-; LE-NEXT:    xxlxor 3, 3, 39
-; LE-NEXT:    xxland 39, 2, 4
+; LE-NEXT:    xxlxor 3, 3, 33
+; LE-NEXT:    xxland 33, 2, 41
+; LE-NEXT:    vmladduhm 1, 6, 1, 7
+; LE-NEXT:    xxlxor 3, 3, 33
+; LE-NEXT:    xxland 33, 2, 42
+; LE-NEXT:    vmladduhm 1, 6, 1, 7
+; LE-NEXT:    xxlxor 3, 3, 33
+; LE-NEXT:    xxland 33, 2, 45
+; LE-NEXT:    vmladduhm 1, 6, 1, 7
+; LE-NEXT:    xxlxor 3, 3, 33
+; LE-NEXT:    xxlxor 3, 3, 32
+; LE-NEXT:    xxland 32, 2, 4
 ; LE-NEXT:    lxvd2x 4, 0, 3
-; LE-NEXT:    vmladduhm 7, 2, 7, 3
-; LE-NEXT:    xxlxor 3, 3, 39
-; LE-NEXT:    xxland 39, 2, 4
-; LE-NEXT:    vmladduhm 7, 2, 7, 3
-; LE-NEXT:    xxlxor 3, 3, 39
-; LE-NEXT:    xxleqv 39, 39, 39
-; LE-NEXT:    vslh 7, 7, 7
-; LE-NEXT:    xxland 39, 2, 39
-; LE-NEXT:    vmladduhm 2, 2, 7, 3
-; LE-NEXT:    xxlxor 34, 3, 34
-; LE-NEXT:    vrlh 2, 2, 5
-; LE-NEXT:    xxland 35, 34, 36
-; LE-NEXT:    vsrh 2, 2, 0
-; LE-NEXT:    vslh 3, 3, 0
-; LE-NEXT:    xxland 2, 34, 36
-; LE-NEXT:    xxlor 34, 2, 35
-; LE-NEXT:    xxland 35, 34, 0
-; LE-NEXT:    vsrh 2, 2, 1
-; LE-NEXT:    vslh 3, 3, 1
-; LE-NEXT:    xxland 0, 34, 0
-; LE-NEXT:    xxlor 34, 0, 35
-; LE-NEXT:    vsrh 3, 2, 6
-; LE-NEXT:    xxland 34, 34, 1
-; LE-NEXT:    xxland 0, 35, 1
-; LE-NEXT:    vadduhm 2, 2, 2
+; LE-NEXT:    addis 3, 2, .LCPI5_7@toc@ha
+; LE-NEXT:    vmladduhm 0, 6, 0, 7
+; LE-NEXT:    addi 3, 3, .LCPI5_7@toc@l
+; LE-NEXT:    xxlxor 3, 3, 32
+; LE-NEXT:    xxland 32, 2, 4
+; LE-NEXT:    lxvd2x 4, 0, 3
+; LE-NEXT:    vmladduhm 0, 6, 0, 7
+; LE-NEXT:    xxlxor 3, 3, 32
+; LE-NEXT:    xxland 32, 2, 4
+; LE-NEXT:    vmladduhm 0, 6, 0, 7
+; LE-NEXT:    xxlxor 3, 3, 32
+; LE-NEXT:    xxleqv 32, 32, 32
+; LE-NEXT:    vslh 0, 0, 0
+; LE-NEXT:    xxland 32, 2, 32
+; LE-NEXT:    vmladduhm 0, 6, 0, 7
+; LE-NEXT:    xxlxor 32, 3, 32
+; LE-NEXT:    vperm 4, 0, 0, 4
+; LE-NEXT:    vslb 0, 4, 5
+; LE-NEXT:    vsrb 4, 4, 5
+; LE-NEXT:    xxlor 36, 36, 32
+; LE-NEXT:    xxland 37, 36, 0
+; LE-NEXT:    vslb 5, 5, 3
+; LE-NEXT:    vsrb 3, 4, 3
+; LE-NEXT:    xxland 0, 35, 0
+; LE-NEXT:    xxlor 35, 0, 37
+; LE-NEXT:    vsrb 2, 3, 2
+; LE-NEXT:    xxland 0, 34, 1
+; LE-NEXT:    xxland 34, 35, 1
+; LE-NEXT:    vaddubm 2, 2, 2
 ; LE-NEXT:    xxlor 34, 0, 34
 ; LE-NEXT:    blr
   %a.ext = zext <8 x i16> %a to <8 x i32>
@@ -2863,1007 +2866,1379 @@ define <8 x i16> @clmulr_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; BE-LABEL: clmulr_v4i32:
 ; BE:       # %bb.0:
-; BE-NEXT:    stdu 1, -1472(1)
-; BE-NEXT:    li 3, 1280
-; BE-NEXT:    vspltisb 12, -1
+; BE-NEXT:    stdu 1, -1440(1)
+; BE-NEXT:    li 3, 1248
+; BE-NEXT:    addi 4, 1, 1152
+; BE-NEXT:    vspltisw 1, 2
 ; BE-NEXT:    stvx 20, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1296
-; BE-NEXT:    vslw 15, 12, 12
-; BE-NEXT:    vspltisw 12, 12
+; BE-NEXT:    vsldoi 10, 1, 1, 2
+; BE-NEXT:    li 3, 1264
+; BE-NEXT:    lis 5, -21846
+; BE-NEXT:    vspltisw 4, 1
 ; BE-NEXT:    stvx 21, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1312
-; BE-NEXT:    vadduwm 17, 12, 12
-; BE-NEXT:    vspltisw 18, 8
+; BE-NEXT:    li 3, 1280
+; BE-NEXT:    lis 9, -13108
+; BE-NEXT:    ori 7, 5, 43690
+; BE-NEXT:    ori 5, 9, 52428
+; BE-NEXT:    vspltisb 6, -1
 ; BE-NEXT:    stvx 22, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1328
-; BE-NEXT:    vsrw 6, 2, 18
-; BE-NEXT:    vspltisw 19, 4
+; BE-NEXT:    li 3, 1296
+; BE-NEXT:    lis 6, 21845
+; BE-NEXT:    vsldoi 22, 1, 1, 3
+; BE-NEXT:    vspltisw 8, 8
 ; BE-NEXT:    stvx 23, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1344
+; BE-NEXT:    li 3, 1312
+; BE-NEXT:    lis 10, 13107
+; BE-NEXT:    ori 8, 6, 21845
+; BE-NEXT:    ori 6, 10, 13107
 ; BE-NEXT:    stvx 24, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1360
+; BE-NEXT:    li 3, 1328
+; BE-NEXT:    lis 11, 3855
 ; BE-NEXT:    stvx 25, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1376
-; BE-NEXT:    vsrw 9, 3, 18
+; BE-NEXT:    li 3, 1344
+; BE-NEXT:    vslw 5, 6, 6
 ; BE-NEXT:    stvx 26, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1392
+; BE-NEXT:    li 3, 1360
 ; BE-NEXT:    stvx 27, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1408
+; BE-NEXT:    li 3, 1376
 ; BE-NEXT:    stvx 28, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1424
-; BE-NEXT:    vsrw 12, 2, 17
+; BE-NEXT:    li 3, 1392
+; BE-NEXT:    vslw 7, 8, 8
 ; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1440
+; BE-NEXT:    li 3, 1408
 ; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1456
-; BE-NEXT:    vspltisw 30, 2
-; BE-NEXT:    vslw 14, 2, 17
+; BE-NEXT:    li 3, 1424
+; BE-NEXT:    vsldoi 19, 8, 8, 2
 ; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addi 3, 1, 1168
+; BE-NEXT:    stvx 3, 0, 3
+; BE-NEXT:    lis 3, -3856
+; BE-NEXT:    ori 3, 3, 61680
+; BE-NEXT:    stvx 2, 0, 4
+; BE-NEXT:    ori 4, 11, 3855
+; BE-NEXT:    vspltisw 2, 4
+; BE-NEXT:    vsldoi 31, 1, 1, 1
+; BE-NEXT:    lwz 9, 1180(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    vslw 14, 2, 2
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    vsldoi 17, 2, 2, 1
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    vsldoi 11, 2, 2, 2
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1212(1)
+; BE-NEXT:    lwz 9, 1176(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    vsldoi 21, 2, 2, 3
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    vsldoi 20, 8, 8, 3
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    vsldoi 0, 4, 4, 1
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1208(1)
+; BE-NEXT:    lwz 9, 1172(1)
+; BE-NEXT:    vsldoi 27, 4, 4, 3
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    vsldoi 9, 4, 4, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1204(1)
+; BE-NEXT:    lwz 9, 1168(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1200(1)
+; BE-NEXT:    lwz 9, 1164(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1196(1)
+; BE-NEXT:    lwz 9, 1160(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1192(1)
+; BE-NEXT:    lwz 9, 1156(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1188(1)
+; BE-NEXT:    lwz 9, 1152(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1184(1)
+; BE-NEXT:    addi 9, 1, 1200
+; BE-NEXT:    lvx 18, 0, 9
+; BE-NEXT:    li 9, 864
+; BE-NEXT:    vand 12, 18, 1
+; BE-NEXT:    stvx 12, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 832
+; BE-NEXT:    vand 1, 18, 4
+; BE-NEXT:    stvx 1, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 800
+; BE-NEXT:    vand 6, 18, 2
+; BE-NEXT:    vadduwm 2, 8, 8
+; BE-NEXT:    stvx 6, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    addis 9, 2, .LCPI6_0@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI6_0@toc@l
+; BE-NEXT:    vand 24, 18, 2
+; BE-NEXT:    lvx 2, 0, 9
+; BE-NEXT:    addis 9, 2, .LCPI6_1@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI6_1@toc@l
+; BE-NEXT:    vand 25, 18, 8
+; BE-NEXT:    vand 8, 18, 2
+; BE-NEXT:    lvx 2, 0, 9
+; BE-NEXT:    addis 9, 2, .LCPI6_2@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI6_2@toc@l
+; BE-NEXT:    vand 15, 18, 2
+; BE-NEXT:    lvx 2, 0, 9
+; BE-NEXT:    addis 9, 2, .LCPI6_3@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI6_3@toc@l
+; BE-NEXT:    vand 30, 18, 2
+; BE-NEXT:    lvx 2, 0, 9
+; BE-NEXT:    addis 9, 2, .LCPI6_4@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI6_4@toc@l
+; BE-NEXT:    vand 23, 18, 2
+; BE-NEXT:    lvx 2, 0, 9
+; BE-NEXT:    addis 9, 2, .LCPI6_5@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI6_5@toc@l
+; BE-NEXT:    vand 13, 18, 2
+; BE-NEXT:    lvx 2, 0, 9
+; BE-NEXT:    li 9, 768
+; BE-NEXT:    vand 26, 18, 14
+; BE-NEXT:    vand 14, 18, 7
+; BE-NEXT:    vand 7, 18, 2
+; BE-NEXT:    stvx 7, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1088
+; BE-NEXT:    vand 2, 18, 10
+; BE-NEXT:    stvx 2, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    addis 9, 2, .LCPI6_6@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI6_6@toc@l
+; BE-NEXT:    lvx 2, 0, 9
+; BE-NEXT:    addis 9, 2, .LCPI6_7@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI6_7@toc@l
+; BE-NEXT:    vand 3, 18, 2
+; BE-NEXT:    lvx 2, 0, 9
+; BE-NEXT:    li 9, 1056
+; BE-NEXT:    vand 2, 18, 2
+; BE-NEXT:    stvx 2, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    addis 9, 2, .LCPI6_8@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI6_8@toc@l
+; BE-NEXT:    lvx 2, 0, 9
+; BE-NEXT:    addis 9, 2, .LCPI6_9@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI6_9@toc@l
+; BE-NEXT:    vand 28, 18, 0
+; BE-NEXT:    vand 0, 18, 2
+; BE-NEXT:    lvx 2, 0, 9
+; BE-NEXT:    li 9, 896
+; BE-NEXT:    vand 2, 18, 2
+; BE-NEXT:    stvx 2, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 928
+; BE-NEXT:    vand 2, 18, 27
+; BE-NEXT:    stvx 2, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 960
+; BE-NEXT:    vand 2, 18, 22
+; BE-NEXT:    stvx 2, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 992
+; BE-NEXT:    vand 2, 18, 21
+; BE-NEXT:    stvx 2, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1024
+; BE-NEXT:    vand 2, 18, 20
+; BE-NEXT:    stvx 2, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    addis 9, 2, .LCPI6_10@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI6_10@toc@l
+; BE-NEXT:    lvx 2, 0, 9
+; BE-NEXT:    li 9, 304
+; BE-NEXT:    vand 2, 18, 2
+; BE-NEXT:    stvx 2, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    addis 9, 2, .LCPI6_11@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI6_11@toc@l
+; BE-NEXT:    lvx 2, 0, 9
+; BE-NEXT:    li 9, 224
+; BE-NEXT:    vand 2, 18, 2
+; BE-NEXT:    stvx 2, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    addis 9, 2, .LCPI6_12@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI6_12@toc@l
+; BE-NEXT:    lvx 2, 0, 9
+; BE-NEXT:    li 9, 128
+; BE-NEXT:    vand 2, 18, 2
+; BE-NEXT:    stvx 2, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 208
+; BE-NEXT:    vand 2, 18, 5
+; BE-NEXT:    stvx 2, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    addi 9, 1, 1184
+; BE-NEXT:    lvx 2, 0, 9
+; BE-NEXT:    li 9, 1136
+; BE-NEXT:    vand 29, 18, 31
+; BE-NEXT:    vand 31, 18, 17
+; BE-NEXT:    vand 17, 18, 9
+; BE-NEXT:    vspltisw 9, -16
+; BE-NEXT:    vxor 4, 4, 4
+; BE-NEXT:    vrlw 5, 12, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1120
+; BE-NEXT:    vrlw 5, 1, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1104
+; BE-NEXT:    vrlw 5, 6, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1072
+; BE-NEXT:    vrlw 5, 25, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1040
+; BE-NEXT:    vrlw 5, 24, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1008
+; BE-NEXT:    vrlw 5, 8, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 976
+; BE-NEXT:    vrlw 5, 26, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 944
+; BE-NEXT:    vrlw 5, 15, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 912
+; BE-NEXT:    vrlw 5, 28, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 880
+; BE-NEXT:    vrlw 5, 29, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 848
+; BE-NEXT:    vrlw 5, 31, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 816
+; BE-NEXT:    vrlw 5, 14, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 784
+; BE-NEXT:    vrlw 5, 30, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 752
+; BE-NEXT:    vrlw 5, 23, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 720
+; BE-NEXT:    vrlw 5, 13, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 688
+; BE-NEXT:    vrlw 5, 7, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 656
+; BE-NEXT:    vrlw 5, 17, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1088
+; BE-NEXT:    lvx 16, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 608
+; BE-NEXT:    vrlw 5, 16, 9
+; BE-NEXT:    vand 11, 18, 11
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 576
+; BE-NEXT:    vrlw 5, 11, 9
+; BE-NEXT:    vand 10, 18, 19
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 544
+; BE-NEXT:    vrlw 5, 10, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 512
+; BE-NEXT:    vrlw 5, 3, 9
+; BE-NEXT:    vmr 20, 3
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1056
+; BE-NEXT:    vmr 18, 13
+; BE-NEXT:    lvx 13, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 480
+; BE-NEXT:    vrlw 5, 13, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 448
+; BE-NEXT:    vrlw 5, 0, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 896
+; BE-NEXT:    lvx 12, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 416
+; BE-NEXT:    vrlw 5, 12, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 928
+; BE-NEXT:    vmr 22, 25
+; BE-NEXT:    vmr 25, 15
+; BE-NEXT:    vmr 15, 11
+; BE-NEXT:    lvx 11, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 384
+; BE-NEXT:    vrlw 5, 11, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 960
+; BE-NEXT:    vmr 19, 14
+; BE-NEXT:    vmr 14, 10
+; BE-NEXT:    lvx 10, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 352
+; BE-NEXT:    vrlw 5, 10, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 992
+; BE-NEXT:    vmr 27, 8
+; BE-NEXT:    lvx 8, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 320
+; BE-NEXT:    vrlw 5, 8, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1024
+; BE-NEXT:    lvx 7, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 272
+; BE-NEXT:    vrlw 5, 7, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 304
+; BE-NEXT:    lvx 6, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 240
+; BE-NEXT:    vrlw 5, 6, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 224
+; BE-NEXT:    lvx 1, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 176
+; BE-NEXT:    vrlw 5, 1, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 128
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 144
+; BE-NEXT:    vrlw 5, 3, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 208
+; BE-NEXT:    lvx 5, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 64
+; BE-NEXT:    vmr 21, 0
+; BE-NEXT:    vrlw 0, 5, 9
+; BE-NEXT:    vmsumuhm 4, 2, 0, 4
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 864
+; BE-NEXT:    lvx 4, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 96
+; BE-NEXT:    vmulouh 4, 2, 4
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 832
+; BE-NEXT:    lvx 4, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 80
+; BE-NEXT:    vmulouh 4, 2, 4
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 800
+; BE-NEXT:    lvx 4, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 112
+; BE-NEXT:    vmulouh 4, 2, 4
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 160
+; BE-NEXT:    vmulouh 4, 2, 22
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 192
+; BE-NEXT:    vmulouh 4, 2, 24
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 256
+; BE-NEXT:    vmulouh 4, 2, 27
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 288
+; BE-NEXT:    vmulouh 4, 2, 26
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 336
+; BE-NEXT:    vmulouh 4, 2, 25
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 368
+; BE-NEXT:    vmulouh 4, 2, 28
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 400
+; BE-NEXT:    vmulouh 4, 2, 29
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 432
+; BE-NEXT:    vmulouh 4, 2, 31
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 464
+; BE-NEXT:    vmulouh 4, 2, 19
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 496
+; BE-NEXT:    vmulouh 4, 2, 30
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 528
+; BE-NEXT:    vmulouh 4, 2, 23
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 560
+; BE-NEXT:    vmulouh 4, 2, 18
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 768
+; BE-NEXT:    lvx 4, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 592
+; BE-NEXT:    vmulouh 4, 2, 4
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 624
+; BE-NEXT:    vmulouh 4, 2, 17
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 640
+; BE-NEXT:    vmulouh 4, 2, 16
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 672
+; BE-NEXT:    vmulouh 4, 2, 15
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 704
+; BE-NEXT:    vmulouh 4, 2, 14
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 736
+; BE-NEXT:    vmulouh 4, 2, 20
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 768
+; BE-NEXT:    vmulouh 4, 2, 13
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 800
+; BE-NEXT:    vmulouh 4, 2, 21
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 832
+; BE-NEXT:    vmulouh 4, 2, 12
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 864
+; BE-NEXT:    vmulouh 4, 2, 11
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 896
+; BE-NEXT:    vmulouh 4, 2, 10
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 928
+; BE-NEXT:    vmulouh 4, 2, 8
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 960
+; BE-NEXT:    vmulouh 4, 2, 7
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 992
+; BE-NEXT:    vmulouh 4, 2, 6
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1024
+; BE-NEXT:    vmulouh 4, 2, 1
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1056
+; BE-NEXT:    vmulouh 3, 2, 3
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1088
+; BE-NEXT:    vmulouh 2, 2, 5
+; BE-NEXT:    stvx 2, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1136
+; BE-NEXT:    lvx 2, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1120
+; BE-NEXT:    vslw 3, 2, 9
+; BE-NEXT:    lvx 2, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1104
+; BE-NEXT:    lvx 4, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1072
+; BE-NEXT:    lvx 5, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1040
+; BE-NEXT:    vslw 2, 2, 9
+; BE-NEXT:    lvx 0, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1008
+; BE-NEXT:    lvx 1, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 976
+; BE-NEXT:    lvx 6, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 944
+; BE-NEXT:    vslw 4, 4, 9
+; BE-NEXT:    lvx 7, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 912
+; BE-NEXT:    lvx 8, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 880
+; BE-NEXT:    lvx 10, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 848
+; BE-NEXT:    vslw 5, 5, 9
+; BE-NEXT:    lvx 11, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 816
+; BE-NEXT:    lvx 12, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 784
+; BE-NEXT:    vslw 0, 0, 9
+; BE-NEXT:    lvx 13, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 752
+; BE-NEXT:    lvx 14, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 720
+; BE-NEXT:    lvx 15, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 688
+; BE-NEXT:    vslw 1, 1, 9
+; BE-NEXT:    lvx 16, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 656
+; BE-NEXT:    lvx 17, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 608
+; BE-NEXT:    lvx 18, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 576
+; BE-NEXT:    vslw 6, 6, 9
+; BE-NEXT:    lvx 19, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 544
+; BE-NEXT:    lvx 31, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 512
+; BE-NEXT:    vslw 7, 7, 9
+; BE-NEXT:    lvx 30, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 480
+; BE-NEXT:    lvx 29, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 448
+; BE-NEXT:    lvx 28, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 416
+; BE-NEXT:    vslw 8, 8, 9
+; BE-NEXT:    lvx 27, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 384
+; BE-NEXT:    lvx 26, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 352
+; BE-NEXT:    lvx 25, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 320
+; BE-NEXT:    vslw 10, 10, 9
+; BE-NEXT:    lvx 24, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 272
+; BE-NEXT:    lvx 23, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 240
+; BE-NEXT:    vslw 11, 11, 9
+; BE-NEXT:    lvx 22, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 176
+; BE-NEXT:    lvx 21, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 144
+; BE-NEXT:    lvx 20, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1136
+; BE-NEXT:    vslw 12, 12, 9
+; BE-NEXT:    vslw 20, 20, 9
+; BE-NEXT:    stvx 20, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 64
+; BE-NEXT:    lvx 20, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 96
+; BE-NEXT:    vslw 13, 13, 9
+; BE-NEXT:    vslw 14, 14, 9
+; BE-NEXT:    vslw 15, 15, 9
+; BE-NEXT:    vslw 16, 16, 9
+; BE-NEXT:    vslw 17, 17, 9
+; BE-NEXT:    vslw 18, 18, 9
+; BE-NEXT:    vslw 19, 19, 9
+; BE-NEXT:    vslw 31, 31, 9
+; BE-NEXT:    vslw 30, 30, 9
+; BE-NEXT:    vslw 29, 29, 9
+; BE-NEXT:    vslw 28, 28, 9
+; BE-NEXT:    vslw 27, 27, 9
+; BE-NEXT:    vslw 26, 26, 9
+; BE-NEXT:    vslw 25, 25, 9
+; BE-NEXT:    vslw 24, 24, 9
+; BE-NEXT:    vslw 23, 23, 9
+; BE-NEXT:    vslw 22, 22, 9
+; BE-NEXT:    vslw 21, 21, 9
+; BE-NEXT:    vslw 20, 20, 9
+; BE-NEXT:    lvx 9, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 80
+; BE-NEXT:    vadduwm 3, 9, 3
+; BE-NEXT:    lvx 9, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 112
+; BE-NEXT:    vadduwm 2, 9, 2
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 160
+; BE-NEXT:    vadduwm 3, 3, 4
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 192
+; BE-NEXT:    vadduwm 3, 3, 5
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 256
+; BE-NEXT:    vadduwm 3, 3, 0
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 288
+; BE-NEXT:    vadduwm 3, 3, 1
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 336
+; BE-NEXT:    vadduwm 3, 3, 6
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 368
+; BE-NEXT:    vadduwm 3, 3, 7
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 400
+; BE-NEXT:    vadduwm 3, 3, 8
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 432
+; BE-NEXT:    vadduwm 3, 3, 10
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 464
+; BE-NEXT:    vadduwm 3, 3, 11
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 496
+; BE-NEXT:    vadduwm 3, 3, 12
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 528
+; BE-NEXT:    vadduwm 3, 3, 13
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 560
+; BE-NEXT:    vadduwm 3, 3, 14
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 592
+; BE-NEXT:    vadduwm 3, 3, 15
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 624
+; BE-NEXT:    vadduwm 3, 3, 16
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 640
+; BE-NEXT:    vadduwm 3, 3, 17
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 672
+; BE-NEXT:    vadduwm 3, 3, 18
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 704
+; BE-NEXT:    vadduwm 3, 3, 19
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 736
+; BE-NEXT:    vadduwm 3, 3, 31
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 768
+; BE-NEXT:    vadduwm 3, 3, 30
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 800
+; BE-NEXT:    vadduwm 3, 3, 29
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 832
+; BE-NEXT:    vadduwm 3, 3, 28
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 864
+; BE-NEXT:    vadduwm 3, 3, 27
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 896
+; BE-NEXT:    vadduwm 3, 3, 26
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 928
+; BE-NEXT:    vadduwm 3, 3, 25
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 960
+; BE-NEXT:    vadduwm 3, 3, 24
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 992
+; BE-NEXT:    vadduwm 3, 3, 23
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1024
+; BE-NEXT:    vadduwm 3, 3, 22
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1056
+; BE-NEXT:    vadduwm 3, 3, 21
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1136
+; BE-NEXT:    lvx 4, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1088
+; BE-NEXT:    vadduwm 3, 3, 4
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    addi 9, 1, 1216
+; BE-NEXT:    vadduwm 3, 3, 20
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    stvx 2, 0, 9
+; BE-NEXT:    lwz 9, 1228(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1244(1)
+; BE-NEXT:    lwz 9, 1224(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1240(1)
+; BE-NEXT:    lwz 9, 1220(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1236(1)
+; BE-NEXT:    lwz 9, 1216(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 7, 10, 7
+; BE-NEXT:    and 8, 9, 8
+; BE-NEXT:    or 7, 8, 7
+; BE-NEXT:    slwi 8, 7, 2
+; BE-NEXT:    srwi 7, 7, 2
+; BE-NEXT:    and 5, 8, 5
+; BE-NEXT:    and 6, 7, 6
+; BE-NEXT:    or 5, 6, 5
+; BE-NEXT:    slwi 6, 5, 4
+; BE-NEXT:    srwi 5, 5, 4
+; BE-NEXT:    and 3, 6, 3
+; BE-NEXT:    and 4, 5, 4
+; BE-NEXT:    or 3, 4, 3
+; BE-NEXT:    rotlwi 4, 3, 24
+; BE-NEXT:    rlwimi 4, 3, 8, 8, 15
+; BE-NEXT:    rlwimi 4, 3, 8, 24, 31
+; BE-NEXT:    rldicl 3, 4, 0, 32
+; BE-NEXT:    stw 3, 1232(1)
+; BE-NEXT:    addi 3, 1, 1232
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    li 3, 1424
+; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1408
+; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1392
+; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1376
+; BE-NEXT:    lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1360
+; BE-NEXT:    lvx 27, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1344
+; BE-NEXT:    lvx 26, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1328
+; BE-NEXT:    lvx 25, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1312
+; BE-NEXT:    lvx 24, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1296
+; BE-NEXT:    lvx 23, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1280
+; BE-NEXT:    lvx 22, 1, 3 # 16-byte Folded Reload
 ; BE-NEXT:    li 3, 1264
-; BE-NEXT:    vspltisw 31, 1
-; BE-NEXT:    stvx 17, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    addis 3, 2, .LCPI6_0@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI6_0@toc@l
-; BE-NEXT:    lvx 29, 0, 3
+; BE-NEXT:    lvx 21, 1, 3 # 16-byte Folded Reload
 ; BE-NEXT:    li 3, 1248
-; BE-NEXT:    vsrw 16, 3, 17
-; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1232
-; BE-NEXT:    vslw 17, 3, 17
-; BE-NEXT:    vand 2, 2, 29
-; BE-NEXT:    vand 3, 3, 29
-; BE-NEXT:    vand 6, 6, 29
-; BE-NEXT:    vand 9, 9, 29
-; BE-NEXT:    vslw 2, 2, 18
-; BE-NEXT:    vslw 3, 3, 18
-; BE-NEXT:    vor 6, 6, 12
-; BE-NEXT:    vspltisb 12, 15
-; BE-NEXT:    stvx 12, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    addis 3, 2, .LCPI6_1@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI6_1@toc@l
-; BE-NEXT:    vor 9, 9, 16
-; BE-NEXT:    vor 2, 14, 2
-; BE-NEXT:    vor 3, 17, 3
-; BE-NEXT:    vor 2, 2, 6
-; BE-NEXT:    vor 3, 3, 9
-; BE-NEXT:    vand 6, 2, 12
-; BE-NEXT:    vsrw 2, 2, 19
-; BE-NEXT:    vand 9, 3, 12
-; BE-NEXT:    vsrw 3, 3, 19
-; BE-NEXT:    vand 2, 2, 12
-; BE-NEXT:    vand 3, 3, 12
-; BE-NEXT:    lvx 12, 0, 3
-; BE-NEXT:    li 3, 1216
-; BE-NEXT:    stvx 12, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    addis 3, 2, .LCPI6_2@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI6_2@toc@l
-; BE-NEXT:    vslw 6, 6, 19
-; BE-NEXT:    vslw 9, 9, 19
-; BE-NEXT:    vor 2, 2, 6
-; BE-NEXT:    vor 3, 3, 9
-; BE-NEXT:    vand 6, 2, 12
-; BE-NEXT:    vsrw 2, 2, 30
-; BE-NEXT:    vand 9, 3, 12
-; BE-NEXT:    vsrw 3, 3, 30
-; BE-NEXT:    vand 2, 2, 12
-; BE-NEXT:    vand 3, 3, 12
-; BE-NEXT:    lvx 12, 0, 3
-; BE-NEXT:    li 3, 1200
-; BE-NEXT:    stvx 12, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1136
-; BE-NEXT:    stvx 18, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    addis 3, 2, .LCPI6_3@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI6_3@toc@l
-; BE-NEXT:    vslw 6, 6, 30
-; BE-NEXT:    vslw 9, 9, 30
-; BE-NEXT:    vor 2, 2, 6
-; BE-NEXT:    vor 3, 3, 9
-; BE-NEXT:    vsrw 6, 2, 31
-; BE-NEXT:    vand 2, 2, 12
-; BE-NEXT:    vadduwm 2, 2, 2
-; BE-NEXT:    vsrw 9, 3, 31
-; BE-NEXT:    vand 3, 3, 12
-; BE-NEXT:    vand 6, 6, 12
-; BE-NEXT:    vand 12, 9, 12
-; BE-NEXT:    vor 9, 6, 2
-; BE-NEXT:    vadduwm 2, 3, 3
-; BE-NEXT:    vor 14, 12, 2
-; BE-NEXT:    vadduwm 2, 18, 18
-; BE-NEXT:    vand 28, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI6_4@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI6_4@toc@l
-; BE-NEXT:    vand 27, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI6_5@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI6_5@toc@l
-; BE-NEXT:    vand 25, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI6_6@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI6_6@toc@l
-; BE-NEXT:    vslw 4, 19, 19
-; BE-NEXT:    vand 26, 14, 4
-; BE-NEXT:    vand 4, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI6_7@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI6_7@toc@l
-; BE-NEXT:    vsldoi 5, 31, 31, 1
-; BE-NEXT:    vand 24, 14, 5
-; BE-NEXT:    vand 5, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI6_8@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI6_8@toc@l
-; BE-NEXT:    vand 29, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI6_9@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI6_9@toc@l
-; BE-NEXT:    vand 21, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI6_10@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI6_10@toc@l
-; BE-NEXT:    vslw 7, 18, 18
-; BE-NEXT:    vand 3, 14, 7
-; BE-NEXT:    vand 7, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI6_11@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI6_11@toc@l
-; BE-NEXT:    vsldoi 13, 18, 18, 2
-; BE-NEXT:    vand 16, 14, 13
-; BE-NEXT:    vand 13, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI6_12@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI6_12@toc@l
-; BE-NEXT:    vand 12, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    li 3, 1184
-; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1168
-; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1152
-; BE-NEXT:    vsldoi 11, 31, 31, 2
-; BE-NEXT:    stvx 19, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1072
-; BE-NEXT:    vsldoi 1, 19, 19, 1
-; BE-NEXT:    vsldoi 10, 30, 30, 2
-; BE-NEXT:    vand 20, 14, 11
-; BE-NEXT:    vand 11, 14, 2
-; BE-NEXT:    vsldoi 2, 31, 31, 3
-; BE-NEXT:    vsldoi 8, 19, 19, 2
-; BE-NEXT:    vand 22, 14, 1
-; BE-NEXT:    vand 1, 14, 10
-; BE-NEXT:    vand 10, 14, 2
-; BE-NEXT:    vsldoi 2, 30, 30, 3
-; BE-NEXT:    vand 17, 14, 8
-; BE-NEXT:    vand 8, 14, 2
-; BE-NEXT:    vsldoi 2, 19, 19, 3
-; BE-NEXT:    vand 2, 14, 2
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1040
-; BE-NEXT:    vsldoi 2, 18, 18, 3
-; BE-NEXT:    vand 2, 14, 2
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    addis 3, 2, .LCPI6_13@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI6_13@toc@l
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    li 3, 1008
-; BE-NEXT:    vand 2, 14, 2
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    addis 3, 2, .LCPI6_14@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI6_14@toc@l
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    li 3, 288
-; BE-NEXT:    vand 2, 14, 2
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    addis 3, 2, .LCPI6_15@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI6_15@toc@l
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    li 3, 192
-; BE-NEXT:    vand 2, 14, 2
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 272
-; BE-NEXT:    vand 2, 14, 15
-; BE-NEXT:    vspltisw 15, -16
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 976
-; BE-NEXT:    vand 2, 14, 30
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 944
-; BE-NEXT:    vand 31, 14, 31
-; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 912
-; BE-NEXT:    vsldoi 0, 30, 30, 1
-; BE-NEXT:    vand 19, 14, 19
-; BE-NEXT:    stvx 19, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 880
-; BE-NEXT:    vand 23, 14, 0
-; BE-NEXT:    vand 14, 14, 18
-; BE-NEXT:    stvx 14, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1120
-; BE-NEXT:    vxor 6, 6, 6
-; BE-NEXT:    vrlw 0, 2, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1104
-; BE-NEXT:    vrlw 0, 31, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1088
-; BE-NEXT:    vrlw 0, 19, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1056
-; BE-NEXT:    vrlw 0, 14, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1024
-; BE-NEXT:    vrlw 0, 28, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 992
-; BE-NEXT:    vrlw 0, 27, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 960
-; BE-NEXT:    vrlw 0, 26, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 928
-; BE-NEXT:    vrlw 0, 25, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 896
-; BE-NEXT:    vrlw 0, 24, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 864
-; BE-NEXT:    vrlw 0, 23, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 832
-; BE-NEXT:    vrlw 0, 22, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 800
-; BE-NEXT:    vrlw 0, 3, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 768
-; BE-NEXT:    vrlw 0, 4, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 736
-; BE-NEXT:    vrlw 0, 5, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 704
-; BE-NEXT:    vrlw 0, 29, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 672
-; BE-NEXT:    vrlw 0, 21, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 640
-; BE-NEXT:    vrlw 0, 20, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 592
-; BE-NEXT:    vrlw 0, 1, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 560
-; BE-NEXT:    vrlw 0, 17, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 528
-; BE-NEXT:    vrlw 0, 16, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 496
-; BE-NEXT:    vrlw 0, 7, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 464
-; BE-NEXT:    vrlw 0, 13, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 432
-; BE-NEXT:    vrlw 0, 12, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 400
-; BE-NEXT:    vrlw 0, 11, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 368
-; BE-NEXT:    vrlw 0, 10, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 336
-; BE-NEXT:    vrlw 0, 8, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1072
-; BE-NEXT:    vmr 14, 7
-; BE-NEXT:    lvx 7, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 304
-; BE-NEXT:    vrlw 0, 7, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1040
-; BE-NEXT:    vmr 30, 1
-; BE-NEXT:    lvx 1, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 240
-; BE-NEXT:    vrlw 0, 1, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1008
-; BE-NEXT:    vmr 19, 5
-; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 208
-; BE-NEXT:    vrlw 0, 5, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 288
-; BE-NEXT:    vmr 18, 4
-; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 160
-; BE-NEXT:    vrlw 0, 4, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 192
-; BE-NEXT:    vmr 31, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 128
-; BE-NEXT:    vrlw 0, 3, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 272
-; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 64
-; BE-NEXT:    vrlw 0, 2, 15
-; BE-NEXT:    vmsumuhm 0, 9, 0, 6
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 976
-; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 96
-; BE-NEXT:    vmulouh 0, 9, 0
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 944
-; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 80
-; BE-NEXT:    vmulouh 0, 9, 0
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 912
-; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 112
-; BE-NEXT:    vmulouh 0, 9, 0
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 880
-; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 144
-; BE-NEXT:    vmulouh 0, 9, 0
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 176
-; BE-NEXT:    vmulouh 0, 9, 28
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 224
-; BE-NEXT:    vmulouh 0, 9, 27
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 256
-; BE-NEXT:    vmulouh 0, 9, 26
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 320
-; BE-NEXT:    vmulouh 0, 9, 25
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 352
-; BE-NEXT:    vmulouh 0, 9, 24
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 384
-; BE-NEXT:    vmulouh 0, 9, 23
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 416
-; BE-NEXT:    vmulouh 0, 9, 22
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 448
-; BE-NEXT:    vmulouh 0, 9, 31
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 480
-; BE-NEXT:    vmulouh 0, 9, 18
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 512
-; BE-NEXT:    vmulouh 0, 9, 19
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 544
-; BE-NEXT:    vmulouh 0, 9, 29
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 576
-; BE-NEXT:    vmulouh 0, 9, 21
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 608
-; BE-NEXT:    vmulouh 0, 9, 20
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 624
-; BE-NEXT:    vmulouh 0, 9, 30
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 656
-; BE-NEXT:    vmulouh 0, 9, 17
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 688
-; BE-NEXT:    vmulouh 0, 9, 16
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 720
-; BE-NEXT:    vmulouh 0, 9, 14
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 752
-; BE-NEXT:    vmulouh 0, 9, 13
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 784
-; BE-NEXT:    vmulouh 0, 9, 12
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 816
-; BE-NEXT:    vmulouh 0, 9, 11
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 848
-; BE-NEXT:    vmulouh 0, 9, 10
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 880
-; BE-NEXT:    vmulouh 0, 9, 8
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 912
-; BE-NEXT:    vmulouh 0, 9, 7
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 944
-; BE-NEXT:    vmulouh 0, 9, 1
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 976
-; BE-NEXT:    vmulouh 5, 9, 5
-; BE-NEXT:    stvx 5, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1008
-; BE-NEXT:    vmulouh 4, 9, 4
-; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1040
-; BE-NEXT:    vmulouh 3, 9, 3
-; BE-NEXT:    stvx 3, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1072
-; BE-NEXT:    vmulouh 2, 9, 2
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1120
-; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1104
-; BE-NEXT:    vslw 9, 2, 15
-; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1088
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1056
-; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1024
-; BE-NEXT:    vslw 2, 2, 15
-; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 992
-; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 960
-; BE-NEXT:    lvx 1, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 928
-; BE-NEXT:    vslw 3, 3, 15
-; BE-NEXT:    lvx 6, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 896
-; BE-NEXT:    lvx 7, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 864
-; BE-NEXT:    lvx 8, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 832
-; BE-NEXT:    vslw 4, 4, 15
-; BE-NEXT:    lvx 10, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 800
-; BE-NEXT:    lvx 11, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 768
-; BE-NEXT:    vslw 5, 5, 15
-; BE-NEXT:    lvx 12, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 736
-; BE-NEXT:    lvx 13, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 704
-; BE-NEXT:    lvx 14, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 672
-; BE-NEXT:    vslw 0, 0, 15
-; BE-NEXT:    lvx 16, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 640
-; BE-NEXT:    lvx 17, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 592
-; BE-NEXT:    lvx 18, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 560
-; BE-NEXT:    vslw 1, 1, 15
-; BE-NEXT:    lvx 19, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 528
-; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 496
-; BE-NEXT:    vslw 6, 6, 15
-; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 464
-; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 432
-; BE-NEXT:    lvx 28, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 400
-; BE-NEXT:    vslw 7, 7, 15
-; BE-NEXT:    lvx 27, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 368
-; BE-NEXT:    lvx 26, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 336
-; BE-NEXT:    lvx 25, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 304
-; BE-NEXT:    vslw 8, 8, 15
-; BE-NEXT:    lvx 24, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 240
-; BE-NEXT:    lvx 23, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 208
-; BE-NEXT:    vslw 10, 10, 15
-; BE-NEXT:    lvx 22, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 160
-; BE-NEXT:    lvx 21, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 128
-; BE-NEXT:    lvx 20, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1120
-; BE-NEXT:    vslw 11, 11, 15
-; BE-NEXT:    vslw 20, 20, 15
-; BE-NEXT:    stvx 20, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 64
 ; BE-NEXT:    lvx 20, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 96
-; BE-NEXT:    vslw 12, 12, 15
-; BE-NEXT:    vslw 13, 13, 15
-; BE-NEXT:    vslw 14, 14, 15
-; BE-NEXT:    vslw 16, 16, 15
-; BE-NEXT:    vslw 17, 17, 15
-; BE-NEXT:    vslw 18, 18, 15
-; BE-NEXT:    vslw 19, 19, 15
-; BE-NEXT:    vslw 31, 31, 15
-; BE-NEXT:    vslw 30, 30, 15
-; BE-NEXT:    vslw 29, 29, 15
-; BE-NEXT:    vslw 28, 28, 15
-; BE-NEXT:    vslw 27, 27, 15
-; BE-NEXT:    vslw 26, 26, 15
-; BE-NEXT:    vslw 25, 25, 15
-; BE-NEXT:    vslw 24, 24, 15
-; BE-NEXT:    vslw 23, 23, 15
-; BE-NEXT:    vslw 22, 22, 15
-; BE-NEXT:    vslw 21, 21, 15
-; BE-NEXT:    vslw 20, 20, 15
-; BE-NEXT:    lvx 15, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 80
-; BE-NEXT:    vadduwm 9, 15, 9
-; BE-NEXT:    lvx 15, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 112
-; BE-NEXT:    vadduwm 2, 15, 2
-; BE-NEXT:    vxor 2, 2, 9
-; BE-NEXT:    lvx 9, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 144
-; BE-NEXT:    vadduwm 3, 9, 3
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 176
-; BE-NEXT:    vadduwm 3, 3, 4
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 224
-; BE-NEXT:    vadduwm 3, 3, 5
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 256
-; BE-NEXT:    vadduwm 3, 3, 0
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 320
-; BE-NEXT:    vadduwm 3, 3, 1
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 352
-; BE-NEXT:    vadduwm 3, 3, 6
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 384
-; BE-NEXT:    vadduwm 3, 3, 7
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 416
-; BE-NEXT:    vadduwm 3, 3, 8
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 448
-; BE-NEXT:    vadduwm 3, 3, 10
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 480
-; BE-NEXT:    vadduwm 3, 3, 11
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 512
-; BE-NEXT:    vadduwm 3, 3, 12
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 544
-; BE-NEXT:    vadduwm 3, 3, 13
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 576
-; BE-NEXT:    vadduwm 3, 3, 14
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 608
-; BE-NEXT:    vadduwm 3, 3, 16
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 624
-; BE-NEXT:    vadduwm 3, 3, 17
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 656
-; BE-NEXT:    vadduwm 3, 3, 18
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 688
-; BE-NEXT:    vadduwm 3, 3, 19
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 720
-; BE-NEXT:    vadduwm 3, 3, 31
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 752
-; BE-NEXT:    vadduwm 3, 3, 30
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 784
-; BE-NEXT:    vadduwm 3, 3, 29
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 816
-; BE-NEXT:    vadduwm 3, 3, 28
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 848
-; BE-NEXT:    vadduwm 3, 3, 27
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 880
-; BE-NEXT:    vadduwm 3, 3, 26
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 912
-; BE-NEXT:    vadduwm 3, 3, 25
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 944
-; BE-NEXT:    vadduwm 3, 3, 24
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 976
-; BE-NEXT:    vadduwm 3, 3, 23
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1008
-; BE-NEXT:    vadduwm 3, 3, 22
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1040
-; BE-NEXT:    vadduwm 3, 3, 21
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1120
-; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1072
-; BE-NEXT:    vadduwm 3, 3, 4
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1264
-; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1136
-; BE-NEXT:    vadduwm 3, 3, 20
-; BE-NEXT:    lvx 1, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1248
-; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1232
-; BE-NEXT:    vsrw 3, 2, 5
-; BE-NEXT:    vsrw 4, 2, 1
-; BE-NEXT:    vslw 5, 2, 5
-; BE-NEXT:    vand 2, 2, 0
-; BE-NEXT:    vslw 2, 2, 1
-; BE-NEXT:    vand 4, 4, 0
-; BE-NEXT:    vor 2, 5, 2
-; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1152
-; BE-NEXT:    vor 3, 4, 3
-; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1216
-; BE-NEXT:    vor 2, 2, 3
-; BE-NEXT:    vand 3, 2, 5
-; BE-NEXT:    vsrw 2, 2, 4
-; BE-NEXT:    vand 2, 2, 5
-; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1168
-; BE-NEXT:    vslw 3, 3, 4
-; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1184
-; BE-NEXT:    vor 2, 2, 3
-; BE-NEXT:    vand 3, 2, 5
-; BE-NEXT:    vsrw 2, 2, 4
-; BE-NEXT:    vslw 3, 3, 4
-; BE-NEXT:    vand 2, 2, 5
-; BE-NEXT:    vor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1200
-; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1456
-; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1440
-; BE-NEXT:    vsrw 3, 2, 3
-; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1424
-; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1408
-; BE-NEXT:    lvx 28, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1392
-; BE-NEXT:    vand 2, 2, 4
-; BE-NEXT:    vadduwm 2, 2, 2
-; BE-NEXT:    lvx 27, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1376
-; BE-NEXT:    lvx 26, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1360
-; BE-NEXT:    lvx 25, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1344
-; BE-NEXT:    vand 3, 3, 4
-; BE-NEXT:    lvx 24, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1328
-; BE-NEXT:    lvx 23, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1312
-; BE-NEXT:    vor 2, 3, 2
-; BE-NEXT:    lvx 22, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1296
-; BE-NEXT:    lvx 21, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1280
-; BE-NEXT:    lvx 20, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    addi 1, 1, 1472
+; BE-NEXT:    addi 1, 1, 1440
 ; BE-NEXT:    blr
 ;
 ; LE-LABEL: clmulr_v4i32:
 ; LE:       # %bb.0:
-; LE-NEXT:    addis 3, 2, .LCPI6_0@toc@ha
-; LE-NEXT:    vspltisw 7, 12
-; LE-NEXT:    vspltisw 4, 8
-; LE-NEXT:    addi 3, 3, .LCPI6_0@toc@l
-; LE-NEXT:    vadduwm 7, 7, 7
-; LE-NEXT:    vsrw 17, 2, 4
-; LE-NEXT:    vspltisb 5, 15
+; LE-NEXT:    xxsldwi 0, 34, 34, 1
+; LE-NEXT:    lis 5, -13108
+; LE-NEXT:    lis 9, 13107
+; LE-NEXT:    xxswapd 1, 34
+; LE-NEXT:    lis 4, 21845
+; LE-NEXT:    lis 10, -3856
+; LE-NEXT:    lis 3, -21846
+; LE-NEXT:    xxsldwi 2, 35, 35, 1
+; LE-NEXT:    ori 6, 5, 52428
+; LE-NEXT:    ori 5, 9, 13107
+; LE-NEXT:    mffprwz 9, 0
+; LE-NEXT:    ori 7, 4, 21845
+; LE-NEXT:    ori 4, 10, 61680
+; LE-NEXT:    mffprwz 10, 1
+; LE-NEXT:    ori 8, 3, 43690
+; LE-NEXT:    lis 11, 3855
+; LE-NEXT:    ori 3, 11, 3855
+; LE-NEXT:    mffprwz 11, 2
+; LE-NEXT:    xxswapd 3, 35
+; LE-NEXT:    mffprwz 12, 3
+; LE-NEXT:    xxsldwi 4, 35, 35, 3
+; LE-NEXT:    xxsldwi 5, 34, 34, 3
+; LE-NEXT:    vspltisw 4, 1
 ; LE-NEXT:    vspltisw 0, 4
-; LE-NEXT:    lxvd2x 0, 0, 3
-; LE-NEXT:    vsrw 16, 2, 7
-; LE-NEXT:    addis 3, 2, .LCPI6_1@toc@ha
-; LE-NEXT:    vspltisw 1, 2
-; LE-NEXT:    vspltisw 6, 1
+; LE-NEXT:    vspltisw 5, 8
+; LE-NEXT:    slwi 0, 9, 1
+; LE-NEXT:    srwi 9, 9, 1
+; LE-NEXT:    and 0, 0, 8
+; LE-NEXT:    and 9, 9, 7
+; LE-NEXT:    vsldoi 11, 4, 4, 1
 ; LE-NEXT:    vsldoi 10, 0, 0, 1
-; LE-NEXT:    addi 3, 3, .LCPI6_1@toc@l
-; LE-NEXT:    vsldoi 13, 0, 0, 2
-; LE-NEXT:    vsldoi 9, 1, 1, 1
-; LE-NEXT:    vsldoi 12, 1, 1, 2
-; LE-NEXT:    vsldoi 14, 4, 4, 2
-; LE-NEXT:    xxland 1, 49, 0
-; LE-NEXT:    vsldoi 8, 6, 6, 1
-; LE-NEXT:    vsldoi 11, 6, 6, 2
-; LE-NEXT:    vsldoi 15, 6, 6, 3
-; LE-NEXT:    xxlor 1, 1, 48
-; LE-NEXT:    vslw 16, 2, 7
-; LE-NEXT:    xxland 34, 34, 0
-; LE-NEXT:    vslw 2, 2, 4
-; LE-NEXT:    xxlor 2, 48, 34
-; LE-NEXT:    xxlor 34, 2, 1
-; LE-NEXT:    xxland 49, 34, 37
-; LE-NEXT:    vsrw 2, 2, 0
-; LE-NEXT:    vslw 17, 17, 0
-; LE-NEXT:    xxland 1, 34, 37
-; LE-NEXT:    xxlor 34, 1, 49
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI6_2@toc@ha
-; LE-NEXT:    addi 3, 3, .LCPI6_2@toc@l
-; LE-NEXT:    xxland 50, 34, 1
-; LE-NEXT:    vsrw 2, 2, 1
-; LE-NEXT:    vslw 18, 18, 1
-; LE-NEXT:    xxland 2, 34, 1
-; LE-NEXT:    xxlor 34, 2, 50
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI6_3@toc@ha
-; LE-NEXT:    vsrw 19, 2, 6
-; LE-NEXT:    addi 3, 3, .LCPI6_3@toc@l
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI6_4@toc@ha
-; LE-NEXT:    xxland 34, 34, 2
-; LE-NEXT:    xxland 3, 51, 2
-; LE-NEXT:    vsrw 19, 3, 4
-; LE-NEXT:    addi 3, 3, .LCPI6_4@toc@l
-; LE-NEXT:    vadduwm 2, 2, 2
-; LE-NEXT:    xxlor 34, 3, 34
-; LE-NEXT:    xxland 3, 51, 0
-; LE-NEXT:    vsrw 19, 3, 7
-; LE-NEXT:    xxlor 3, 3, 51
-; LE-NEXT:    vslw 19, 3, 7
-; LE-NEXT:    xxland 35, 35, 0
-; LE-NEXT:    vslw 3, 3, 4
-; LE-NEXT:    vsldoi 16, 1, 1, 3
-; LE-NEXT:    xxlor 4, 51, 35
-; LE-NEXT:    xxlor 35, 4, 3
-; LE-NEXT:    xxland 51, 35, 37
-; LE-NEXT:    vsrw 3, 3, 0
-; LE-NEXT:    vslw 19, 19, 0
-; LE-NEXT:    xxland 3, 35, 37
-; LE-NEXT:    xxlor 35, 3, 51
-; LE-NEXT:    xxland 51, 35, 1
-; LE-NEXT:    vsrw 3, 3, 1
-; LE-NEXT:    vslw 19, 19, 1
-; LE-NEXT:    xxland 3, 35, 1
-; LE-NEXT:    xxlor 35, 3, 51
-; LE-NEXT:    vsrw 19, 3, 6
-; LE-NEXT:    xxland 35, 35, 2
-; LE-NEXT:    xxland 3, 51, 2
-; LE-NEXT:    vadduwm 3, 3, 3
-; LE-NEXT:    xxlor 3, 3, 35
-; LE-NEXT:    xxland 35, 3, 33
-; LE-NEXT:    xxland 51, 3, 38
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    vmuluwm 19, 2, 19
-; LE-NEXT:    xxlxor 4, 51, 35
-; LE-NEXT:    xxland 35, 3, 32
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 36
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    vadduwm 3, 4, 4
-; LE-NEXT:    xxland 35, 3, 35
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    vsldoi 17, 0, 0, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI6_5@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI6_5@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    vslw 3, 0, 0
-; LE-NEXT:    xxland 35, 3, 35
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI6_6@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI6_6@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 40
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 41
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 42
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    vslw 3, 4, 4
-; LE-NEXT:    xxland 35, 3, 35
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI6_7@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI6_7@toc@l
-; LE-NEXT:    vsldoi 18, 4, 4, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI6_8@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI6_8@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI6_9@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI6_9@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI6_10@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI6_10@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 43
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 44
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 45
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 46
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI6_11@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI6_11@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI6_12@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI6_12@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI6_13@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI6_13@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI6_14@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI6_14@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 47
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 48
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 49
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 50
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI6_15@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI6_15@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    vsldoi 7, 0, 0, 2
+; LE-NEXT:    vsldoi 1, 4, 4, 2
+; LE-NEXT:    vsldoi 8, 5, 5, 2
+; LE-NEXT:    or 9, 9, 0
+; LE-NEXT:    slwi 0, 10, 1
+; LE-NEXT:    srwi 10, 10, 1
+; LE-NEXT:    and 0, 0, 8
+; LE-NEXT:    and 10, 10, 7
+; LE-NEXT:    or 10, 10, 0
+; LE-NEXT:    slwi 0, 11, 1
+; LE-NEXT:    srwi 11, 11, 1
+; LE-NEXT:    and 0, 0, 8
+; LE-NEXT:    and 11, 11, 7
+; LE-NEXT:    or 11, 11, 0
+; LE-NEXT:    slwi 0, 12, 1
+; LE-NEXT:    srwi 12, 12, 1
+; LE-NEXT:    and 0, 0, 8
+; LE-NEXT:    and 12, 12, 7
+; LE-NEXT:    or 12, 12, 0
+; LE-NEXT:    slwi 0, 9, 2
+; LE-NEXT:    srwi 9, 9, 2
+; LE-NEXT:    and 0, 0, 6
+; LE-NEXT:    and 9, 9, 5
+; LE-NEXT:    or 9, 9, 0
+; LE-NEXT:    slwi 0, 10, 2
+; LE-NEXT:    srwi 10, 10, 2
+; LE-NEXT:    and 0, 0, 6
+; LE-NEXT:    and 10, 10, 5
+; LE-NEXT:    or 10, 10, 0
+; LE-NEXT:    slwi 0, 11, 2
+; LE-NEXT:    srwi 11, 11, 2
+; LE-NEXT:    and 0, 0, 6
+; LE-NEXT:    and 11, 11, 5
+; LE-NEXT:    or 11, 11, 0
+; LE-NEXT:    slwi 0, 9, 4
+; LE-NEXT:    srwi 9, 9, 4
+; LE-NEXT:    and 0, 0, 4
+; LE-NEXT:    and 9, 9, 3
+; LE-NEXT:    or 9, 9, 0
+; LE-NEXT:    slwi 0, 10, 4
+; LE-NEXT:    srwi 10, 10, 4
+; LE-NEXT:    and 0, 0, 4
+; LE-NEXT:    and 10, 10, 3
+; LE-NEXT:    or 10, 10, 0
+; LE-NEXT:    slwi 0, 11, 4
+; LE-NEXT:    srwi 11, 11, 4
+; LE-NEXT:    and 0, 0, 4
+; LE-NEXT:    and 11, 11, 3
+; LE-NEXT:    or 11, 11, 0
+; LE-NEXT:    rotlwi 0, 9, 24
+; LE-NEXT:    rlwimi 0, 9, 8, 8, 15
+; LE-NEXT:    rlwimi 0, 9, 8, 24, 31
+; LE-NEXT:    rotlwi 9, 10, 24
+; LE-NEXT:    rlwimi 9, 10, 8, 8, 15
+; LE-NEXT:    rlwimi 9, 10, 8, 24, 31
+; LE-NEXT:    rotlwi 10, 11, 24
+; LE-NEXT:    rlwimi 10, 11, 8, 8, 15
+; LE-NEXT:    rlwimi 10, 11, 8, 24, 31
+; LE-NEXT:    rldicl 11, 0, 0, 32
+; LE-NEXT:    rldicl 0, 9, 0, 32
+; LE-NEXT:    mffprwz 9, 5
+; LE-NEXT:    rldicl 10, 10, 0, 32
+; LE-NEXT:    rldimi 0, 11, 32, 0
+; LE-NEXT:    slwi 11, 12, 2
+; LE-NEXT:    srwi 12, 12, 2
+; LE-NEXT:    and 11, 11, 6
+; LE-NEXT:    and 12, 12, 5
+; LE-NEXT:    mtfprd 1, 0
+; LE-NEXT:    or 11, 12, 11
+; LE-NEXT:    slwi 12, 11, 4
+; LE-NEXT:    srwi 11, 11, 4
+; LE-NEXT:    and 12, 12, 4
+; LE-NEXT:    and 11, 11, 3
+; LE-NEXT:    or 11, 11, 12
+; LE-NEXT:    rotlwi 12, 11, 24
+; LE-NEXT:    rlwimi 12, 11, 8, 8, 15
+; LE-NEXT:    rlwimi 12, 11, 8, 24, 31
+; LE-NEXT:    rldicl 11, 12, 0, 32
+; LE-NEXT:    rldimi 11, 10, 32, 0
+; LE-NEXT:    mffprwz 10, 4
+; LE-NEXT:    mtfprd 0, 11
+; LE-NEXT:    mfvsrwz 11, 35
+; LE-NEXT:    slwi 12, 10, 1
+; LE-NEXT:    srwi 10, 10, 1
+; LE-NEXT:    and 12, 12, 8
+; LE-NEXT:    and 10, 10, 7
+; LE-NEXT:    or 10, 10, 12
+; LE-NEXT:    slwi 12, 10, 2
+; LE-NEXT:    srwi 10, 10, 2
+; LE-NEXT:    and 12, 12, 6
+; LE-NEXT:    and 10, 10, 5
+; LE-NEXT:    or 10, 10, 12
+; LE-NEXT:    slwi 12, 10, 4
+; LE-NEXT:    srwi 10, 10, 4
+; LE-NEXT:    and 12, 12, 4
+; LE-NEXT:    and 10, 10, 3
+; LE-NEXT:    or 10, 10, 12
+; LE-NEXT:    rotlwi 12, 10, 24
+; LE-NEXT:    rlwimi 12, 10, 8, 8, 15
+; LE-NEXT:    rlwimi 12, 10, 8, 24, 31
+; LE-NEXT:    slwi 10, 11, 1
+; LE-NEXT:    srwi 11, 11, 1
+; LE-NEXT:    rldicl 12, 12, 0, 32
+; LE-NEXT:    and 10, 10, 8
+; LE-NEXT:    and 11, 11, 7
+; LE-NEXT:    or 10, 11, 10
+; LE-NEXT:    slwi 11, 10, 2
+; LE-NEXT:    srwi 10, 10, 2
+; LE-NEXT:    and 11, 11, 6
+; LE-NEXT:    and 10, 10, 5
+; LE-NEXT:    or 10, 10, 11
+; LE-NEXT:    slwi 11, 10, 4
+; LE-NEXT:    srwi 10, 10, 4
+; LE-NEXT:    and 11, 11, 4
+; LE-NEXT:    and 10, 10, 3
+; LE-NEXT:    or 10, 10, 11
+; LE-NEXT:    rotlwi 11, 10, 24
+; LE-NEXT:    rlwimi 11, 10, 8, 8, 15
+; LE-NEXT:    rlwimi 11, 10, 8, 24, 31
+; LE-NEXT:    mfvsrwz 10, 34
+; LE-NEXT:    rldicl 11, 11, 0, 32
+; LE-NEXT:    rldimi 11, 12, 32, 0
+; LE-NEXT:    mtfprd 2, 11
+; LE-NEXT:    slwi 11, 9, 1
+; LE-NEXT:    srwi 9, 9, 1
+; LE-NEXT:    and 11, 11, 8
+; LE-NEXT:    and 9, 9, 7
+; LE-NEXT:    or 9, 9, 11
+; LE-NEXT:    slwi 11, 9, 2
+; LE-NEXT:    srwi 9, 9, 2
+; LE-NEXT:    and 11, 11, 6
+; LE-NEXT:    and 9, 9, 5
+; LE-NEXT:    or 9, 9, 11
+; LE-NEXT:    slwi 11, 9, 4
+; LE-NEXT:    srwi 9, 9, 4
+; LE-NEXT:    and 11, 11, 4
+; LE-NEXT:    and 9, 9, 3
+; LE-NEXT:    or 9, 9, 11
+; LE-NEXT:    rotlwi 11, 9, 24
+; LE-NEXT:    rlwimi 11, 9, 8, 8, 15
+; LE-NEXT:    rlwimi 11, 9, 8, 24, 31
+; LE-NEXT:    rldicl 9, 11, 0, 32
+; LE-NEXT:    slwi 11, 10, 1
+; LE-NEXT:    srwi 10, 10, 1
+; LE-NEXT:    and 11, 11, 8
+; LE-NEXT:    and 10, 10, 7
+; LE-NEXT:    or 10, 10, 11
+; LE-NEXT:    slwi 11, 10, 2
+; LE-NEXT:    srwi 10, 10, 2
+; LE-NEXT:    and 11, 11, 6
+; LE-NEXT:    and 10, 10, 5
+; LE-NEXT:    or 10, 10, 11
+; LE-NEXT:    slwi 11, 10, 4
+; LE-NEXT:    srwi 10, 10, 4
+; LE-NEXT:    and 11, 11, 4
+; LE-NEXT:    and 10, 10, 3
+; LE-NEXT:    or 10, 10, 11
+; LE-NEXT:    rotlwi 11, 10, 24
+; LE-NEXT:    rlwimi 11, 10, 8, 8, 15
+; LE-NEXT:    rlwimi 11, 10, 8, 24, 31
+; LE-NEXT:    rldicl 10, 11, 0, 32
+; LE-NEXT:    addis 11, 2, .LCPI6_12@toc@ha
+; LE-NEXT:    rldimi 10, 9, 32, 0
+; LE-NEXT:    addis 9, 2, .LCPI6_0@toc@ha
+; LE-NEXT:    addi 9, 9, .LCPI6_0@toc@l
+; LE-NEXT:    vspltisw 3, 2
+; LE-NEXT:    vsldoi 9, 3, 3, 1
+; LE-NEXT:    vsldoi 6, 3, 3, 2
+; LE-NEXT:    xxmrghd 0, 2, 0
+; LE-NEXT:    mtfprd 2, 10
+; LE-NEXT:    xxland 44, 0, 35
+; LE-NEXT:    xxland 45, 0, 36
+; LE-NEXT:    xxland 43, 0, 43
+; LE-NEXT:    xxland 41, 0, 41
+; LE-NEXT:    addis 10, 2, .LCPI6_11@toc@ha
+; LE-NEXT:    addi 10, 10, .LCPI6_11@toc@l
+; LE-NEXT:    vsldoi 4, 4, 4, 3
+; LE-NEXT:    xxland 36, 0, 36
+; LE-NEXT:    lxvd2x 3, 0, 10
+; LE-NEXT:    xxmrghd 34, 2, 1
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI6_1@toc@ha
+; LE-NEXT:    vmuluwm 12, 2, 12
+; LE-NEXT:    vmuluwm 13, 2, 13
+; LE-NEXT:    addi 9, 9, .LCPI6_1@toc@l
+; LE-NEXT:    vmuluwm 11, 2, 11
+; LE-NEXT:    vmuluwm 9, 2, 9
+; LE-NEXT:    vmuluwm 4, 2, 4
+; LE-NEXT:    xxlxor 1, 45, 44
+; LE-NEXT:    xxland 44, 0, 32
+; LE-NEXT:    vmuluwm 12, 2, 12
+; LE-NEXT:    xxlxor 1, 1, 44
+; LE-NEXT:    xxland 44, 0, 37
+; LE-NEXT:    vmuluwm 12, 2, 12
+; LE-NEXT:    xxlxor 1, 1, 44
+; LE-NEXT:    vadduwm 12, 5, 5
+; LE-NEXT:    xxland 44, 0, 44
+; LE-NEXT:    vmuluwm 12, 2, 12
+; LE-NEXT:    xxlxor 1, 1, 44
+; LE-NEXT:    xxland 44, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI6_2@toc@ha
+; LE-NEXT:    vmuluwm 12, 2, 12
+; LE-NEXT:    addi 9, 9, .LCPI6_2@toc@l
+; LE-NEXT:    xxlxor 1, 1, 44
+; LE-NEXT:    vslw 12, 0, 0
+; LE-NEXT:    xxland 44, 0, 44
+; LE-NEXT:    vmuluwm 12, 2, 12
+; LE-NEXT:    xxlxor 1, 1, 44
+; LE-NEXT:    xxland 44, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI6_3@toc@ha
+; LE-NEXT:    vmuluwm 12, 2, 12
+; LE-NEXT:    addi 9, 9, .LCPI6_3@toc@l
+; LE-NEXT:    xxlxor 1, 1, 44
+; LE-NEXT:    xxlxor 1, 1, 43
+; LE-NEXT:    xxlxor 1, 1, 41
+; LE-NEXT:    xxland 41, 0, 42
+; LE-NEXT:    vslw 10, 5, 5
+; LE-NEXT:    vmuluwm 9, 2, 9
+; LE-NEXT:    xxlxor 1, 1, 41
+; LE-NEXT:    vsldoi 5, 5, 5, 3
+; LE-NEXT:    vsldoi 9, 3, 3, 3
+; LE-NEXT:    xxland 35, 0, 42
 ; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    xxlxor 1, 1, 35
+; LE-NEXT:    vsldoi 3, 0, 0, 3
+; LE-NEXT:    xxland 32, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI6_4@toc@ha
+; LE-NEXT:    vmuluwm 0, 2, 0
+; LE-NEXT:    addi 9, 9, .LCPI6_4@toc@l
+; LE-NEXT:    xxland 35, 0, 35
 ; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI6_5@toc@ha
+; LE-NEXT:    vmuluwm 0, 2, 0
+; LE-NEXT:    addi 9, 9, .LCPI6_5@toc@l
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI6_6@toc@ha
+; LE-NEXT:    vmuluwm 0, 2, 0
+; LE-NEXT:    addi 9, 9, .LCPI6_6@toc@l
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI6_7@toc@ha
+; LE-NEXT:    vmuluwm 0, 2, 0
+; LE-NEXT:    addi 9, 9, .LCPI6_7@toc@l
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 33
+; LE-NEXT:    vmuluwm 0, 2, 0
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 38
+; LE-NEXT:    vmuluwm 0, 2, 0
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 39
+; LE-NEXT:    vmuluwm 0, 2, 0
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 40
+; LE-NEXT:    vmuluwm 0, 2, 0
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI6_8@toc@ha
+; LE-NEXT:    vmuluwm 0, 2, 0
+; LE-NEXT:    addi 9, 9, .LCPI6_8@toc@l
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI6_9@toc@ha
+; LE-NEXT:    vmuluwm 0, 2, 0
+; LE-NEXT:    addi 9, 9, .LCPI6_9@toc@l
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI6_10@toc@ha
+; LE-NEXT:    vmuluwm 0, 2, 0
+; LE-NEXT:    addi 9, 9, .LCPI6_10@toc@l
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addi 9, 11, .LCPI6_12@toc@l
+; LE-NEXT:    vmuluwm 0, 2, 0
+; LE-NEXT:    lxvd2x 4, 0, 9
+; LE-NEXT:    xxland 33, 0, 4
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 3
+; LE-NEXT:    vmuluwm 1, 2, 1
+; LE-NEXT:    xxlxor 1, 1, 36
+; LE-NEXT:    xxland 36, 0, 41
+; LE-NEXT:    vmuluwm 0, 2, 0
+; LE-NEXT:    vmuluwm 4, 2, 4
+; LE-NEXT:    xxlxor 1, 1, 36
+; LE-NEXT:    xxland 36, 0, 37
+; LE-NEXT:    xxland 37, 0, 2
+; LE-NEXT:    vmuluwm 4, 2, 4
+; LE-NEXT:    vmuluwm 5, 2, 5
+; LE-NEXT:    xxlxor 1, 1, 35
 ; LE-NEXT:    xxleqv 35, 35, 35
 ; LE-NEXT:    vslw 3, 3, 3
-; LE-NEXT:    xxland 35, 3, 35
+; LE-NEXT:    xxland 35, 0, 35
 ; LE-NEXT:    vmuluwm 2, 2, 3
-; LE-NEXT:    xxlxor 34, 4, 34
-; LE-NEXT:    vsrw 8, 2, 4
-; LE-NEXT:    vsrw 3, 2, 7
-; LE-NEXT:    xxland 3, 40, 0
-; LE-NEXT:    xxlor 3, 3, 35
-; LE-NEXT:    vslw 3, 2, 7
-; LE-NEXT:    xxland 34, 34, 0
-; LE-NEXT:    vslw 2, 2, 4
-; LE-NEXT:    xxlor 0, 35, 34
-; LE-NEXT:    xxlor 34, 0, 3
-; LE-NEXT:    xxland 35, 34, 37
-; LE-NEXT:    vsrw 2, 2, 0
-; LE-NEXT:    vslw 3, 3, 0
-; LE-NEXT:    xxland 0, 34, 37
-; LE-NEXT:    xxlor 34, 0, 35
-; LE-NEXT:    xxland 35, 34, 1
-; LE-NEXT:    vsrw 2, 2, 1
-; LE-NEXT:    vslw 3, 3, 1
-; LE-NEXT:    xxland 0, 34, 1
-; LE-NEXT:    xxlor 34, 0, 35
-; LE-NEXT:    vsrw 3, 2, 6
-; LE-NEXT:    xxland 34, 34, 2
-; LE-NEXT:    xxland 0, 35, 2
-; LE-NEXT:    vadduwm 2, 2, 2
-; LE-NEXT:    xxlor 34, 0, 34
+; LE-NEXT:    xxlxor 0, 1, 36
+; LE-NEXT:    xxlxor 0, 0, 37
+; LE-NEXT:    xxlxor 0, 0, 32
+; LE-NEXT:    xxlxor 0, 0, 33
+; LE-NEXT:    xxlxor 0, 0, 34
+; LE-NEXT:    xxsldwi 1, 0, 0, 1
+; LE-NEXT:    xxswapd 2, 0
+; LE-NEXT:    xxsldwi 3, 0, 0, 3
+; LE-NEXT:    mffprwz 9, 1
+; LE-NEXT:    mffprwz 10, 2
+; LE-NEXT:    mffprwz 11, 3
+; LE-NEXT:    slwi 12, 9, 1
+; LE-NEXT:    srwi 9, 9, 1
+; LE-NEXT:    and 12, 12, 8
+; LE-NEXT:    and 9, 9, 7
+; LE-NEXT:    or 9, 9, 12
+; LE-NEXT:    slwi 12, 10, 1
+; LE-NEXT:    srwi 10, 10, 1
+; LE-NEXT:    and 12, 12, 8
+; LE-NEXT:    and 10, 10, 7
+; LE-NEXT:    or 10, 10, 12
+; LE-NEXT:    slwi 12, 11, 1
+; LE-NEXT:    srwi 11, 11, 1
+; LE-NEXT:    and 12, 12, 8
+; LE-NEXT:    and 11, 11, 7
+; LE-NEXT:    or 11, 11, 12
+; LE-NEXT:    slwi 12, 9, 2
+; LE-NEXT:    srwi 9, 9, 2
+; LE-NEXT:    and 12, 12, 6
+; LE-NEXT:    and 9, 9, 5
+; LE-NEXT:    or 9, 9, 12
+; LE-NEXT:    slwi 12, 10, 2
+; LE-NEXT:    srwi 10, 10, 2
+; LE-NEXT:    and 12, 12, 6
+; LE-NEXT:    and 10, 10, 5
+; LE-NEXT:    or 10, 10, 12
+; LE-NEXT:    slwi 12, 11, 2
+; LE-NEXT:    srwi 11, 11, 2
+; LE-NEXT:    and 12, 12, 6
+; LE-NEXT:    and 11, 11, 5
+; LE-NEXT:    or 11, 11, 12
+; LE-NEXT:    slwi 12, 9, 4
+; LE-NEXT:    srwi 9, 9, 4
+; LE-NEXT:    and 12, 12, 4
+; LE-NEXT:    and 9, 9, 3
+; LE-NEXT:    or 9, 9, 12
+; LE-NEXT:    slwi 12, 10, 4
+; LE-NEXT:    srwi 10, 10, 4
+; LE-NEXT:    and 12, 12, 4
+; LE-NEXT:    and 10, 10, 3
+; LE-NEXT:    or 10, 10, 12
+; LE-NEXT:    slwi 12, 11, 4
+; LE-NEXT:    srwi 11, 11, 4
+; LE-NEXT:    and 12, 12, 4
+; LE-NEXT:    and 11, 11, 3
+; LE-NEXT:    or 11, 11, 12
+; LE-NEXT:    rotlwi 12, 9, 24
+; LE-NEXT:    rlwimi 12, 9, 8, 8, 15
+; LE-NEXT:    rlwimi 12, 9, 8, 24, 31
+; LE-NEXT:    rotlwi 9, 10, 24
+; LE-NEXT:    rlwimi 9, 10, 8, 8, 15
+; LE-NEXT:    rlwimi 9, 10, 8, 24, 31
+; LE-NEXT:    rotlwi 10, 11, 24
+; LE-NEXT:    rldicl 9, 9, 0, 32
+; LE-NEXT:    rlwimi 10, 11, 8, 8, 15
+; LE-NEXT:    rlwimi 10, 11, 8, 24, 31
+; LE-NEXT:    rldicl 11, 12, 0, 32
+; LE-NEXT:    rldimi 9, 11, 32, 0
+; LE-NEXT:    mtfprd 1, 9
+; LE-NEXT:    rldicl 9, 10, 0, 32
+; LE-NEXT:    mffprwz 10, 0
+; LE-NEXT:    slwi 11, 10, 1
+; LE-NEXT:    srwi 10, 10, 1
+; LE-NEXT:    and 8, 11, 8
+; LE-NEXT:    and 7, 10, 7
+; LE-NEXT:    or 7, 7, 8
+; LE-NEXT:    slwi 8, 7, 2
+; LE-NEXT:    srwi 7, 7, 2
+; LE-NEXT:    and 6, 8, 6
+; LE-NEXT:    and 5, 7, 5
+; LE-NEXT:    or 5, 5, 6
+; LE-NEXT:    slwi 6, 5, 4
+; LE-NEXT:    srwi 5, 5, 4
+; LE-NEXT:    and 4, 6, 4
+; LE-NEXT:    and 3, 5, 3
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    rotlwi 4, 3, 24
+; LE-NEXT:    rlwimi 4, 3, 8, 8, 15
+; LE-NEXT:    rlwimi 4, 3, 8, 24, 31
+; LE-NEXT:    rldicl 3, 4, 0, 32
+; LE-NEXT:    rldimi 3, 9, 32, 0
+; LE-NEXT:    mtfprd 0, 3
+; LE-NEXT:    xxmrghd 34, 0, 1
 ; LE-NEXT:    blr
   %a.ext = zext <4 x i32> %a to <4 x i64>
   %b.ext = zext <4 x i32> %b to <4 x i64>
@@ -5429,1720 +5804,2101 @@ define <2 x i64> @clmulr_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; LE-NEXT:    xor 3, 3, 4
 ; LE-NEXT:    rlwinm 4, 9, 0, 5, 5
 ; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rlwinm 4, 9, 0, 4, 4
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rlwinm 4, 9, 0, 3, 3
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rlwinm 4, 9, 0, 2, 2
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rlwinm 4, 9, 0, 1, 1
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rlwinm 4, 9, 0, 0, 0
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 32, 32
-; LE-NEXT:    rldicl 4, 4, 32, 31
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 31, 33
-; LE-NEXT:    rldicl 4, 4, 33, 30
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 30, 34
-; LE-NEXT:    rldicl 4, 4, 34, 29
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 29, 35
-; LE-NEXT:    rldicl 4, 4, 35, 28
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 28, 36
-; LE-NEXT:    rldicl 4, 4, 36, 27
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 27, 37
-; LE-NEXT:    rldicl 4, 4, 37, 26
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 26, 38
-; LE-NEXT:    rldicl 4, 4, 38, 25
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 25, 39
-; LE-NEXT:    rldicl 4, 4, 39, 24
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 24, 40
-; LE-NEXT:    rldicl 4, 4, 40, 23
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 23, 41
-; LE-NEXT:    rldicl 4, 4, 41, 22
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 22, 42
-; LE-NEXT:    rldicl 4, 4, 42, 21
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 21, 43
-; LE-NEXT:    rldicl 4, 4, 43, 20
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 20, 44
-; LE-NEXT:    rldicl 4, 4, 44, 19
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 19, 45
-; LE-NEXT:    rldicl 4, 4, 45, 18
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 18, 46
-; LE-NEXT:    rldicl 4, 4, 46, 17
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 17, 47
-; LE-NEXT:    rldicl 4, 4, 47, 16
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 16, 48
-; LE-NEXT:    rldicl 4, 4, 48, 15
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 15, 49
-; LE-NEXT:    rldicl 4, 4, 49, 14
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 14, 50
-; LE-NEXT:    rldicl 4, 4, 50, 13
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 13, 51
-; LE-NEXT:    rldicl 4, 4, 51, 12
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 12, 52
-; LE-NEXT:    rldicl 4, 4, 52, 11
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 11, 53
-; LE-NEXT:    rldicl 4, 4, 53, 10
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 10, 54
-; LE-NEXT:    rldicl 4, 4, 54, 9
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 9, 55
-; LE-NEXT:    rldicl 4, 4, 55, 8
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 8, 56
-; LE-NEXT:    rldicl 4, 4, 56, 7
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 7, 57
-; LE-NEXT:    rldicl 4, 4, 57, 6
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 6, 58
-; LE-NEXT:    rldicl 4, 4, 58, 5
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 5, 59
-; LE-NEXT:    rldicl 4, 4, 59, 4
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 4, 60
-; LE-NEXT:    rldicl 4, 4, 60, 3
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 3, 61
-; LE-NEXT:    rldicl 4, 4, 61, 2
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicl 4, 9, 2, 62
-; LE-NEXT:    rldicl 4, 4, 62, 1
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    rldicr 4, 9, 0, 0
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    sldi 4, 3, 1
-; LE-NEXT:    rldicl 3, 3, 63, 1
-; LE-NEXT:    and 4, 4, 7
-; LE-NEXT:    and 3, 3, 6
-; LE-NEXT:    or 3, 3, 4
-; LE-NEXT:    sldi 4, 3, 2
-; LE-NEXT:    rldicl 3, 3, 62, 2
-; LE-NEXT:    and 4, 4, 11
-; LE-NEXT:    and 3, 3, 8
-; LE-NEXT:    or 3, 3, 4
-; LE-NEXT:    sldi 4, 3, 4
-; LE-NEXT:    rldicl 3, 3, 60, 4
-; LE-NEXT:    and 4, 4, 0
-; LE-NEXT:    and 3, 3, 12
-; LE-NEXT:    or 3, 3, 4
-; LE-NEXT:    rldicl 4, 3, 32, 32
-; LE-NEXT:    rotlwi 5, 4, 24
-; LE-NEXT:    rlwimi 5, 4, 8, 8, 15
-; LE-NEXT:    rlwimi 5, 4, 8, 24, 31
-; LE-NEXT:    rotlwi 4, 3, 24
-; LE-NEXT:    rlwimi 4, 3, 8, 8, 15
-; LE-NEXT:    rlwimi 4, 3, 8, 24, 31
-; LE-NEXT:    sldi 3, 4, 32
-; LE-NEXT:    or 3, 3, 5
-; LE-NEXT:    mtfprd 1, 3
-; LE-NEXT:    xxmrghd 34, 1, 0
-; LE-NEXT:    addi 1, 1, 752
-; LE-NEXT:    blr
-  %a.ext = zext <2 x i64> %a to <2 x i128>
-  %b.ext = zext <2 x i64> %b to <2 x i128>
-  %clmul = call <2 x i128> @llvm.clmul.v2i128(<2 x i128> %a.ext, <2 x i128> %b.ext)
-  %res.ext = lshr <2 x i128> %clmul, splat (i128 63)
-  %res = trunc <2 x i128> %res.ext to <2 x i64>
-  ret <2 x i64> %res
-}
-
-define <16 x i8> @clmulh_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
-; BE-LABEL: clmulh_v16i8:
-; BE:       # %bb.0:
-; BE-NEXT:    li 3, -48
-; BE-NEXT:    vspltisb 4, 4
-; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, -32
-; BE-NEXT:    vsrb 1, 3, 4
-; BE-NEXT:    vspltisb 5, 15
-; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, -16
-; BE-NEXT:    vspltisb 7, -1
-; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    addis 3, 2, .LCPI8_0@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI8_0@toc@l
-; BE-NEXT:    vand 3, 3, 5
-; BE-NEXT:    vspltisb 13, 8
-; BE-NEXT:    vslb 3, 3, 4
-; BE-NEXT:    vsrb 0, 2, 4
-; BE-NEXT:    vand 2, 2, 5
-; BE-NEXT:    vor 1, 1, 3
-; BE-NEXT:    lvx 3, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI8_1@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI8_1@toc@l
-; BE-NEXT:    vslb 2, 2, 4
-; BE-NEXT:    vor 0, 0, 2
-; BE-NEXT:    vspltisb 2, 2
-; BE-NEXT:    vsrb 9, 1, 2
-; BE-NEXT:    vand 1, 1, 3
-; BE-NEXT:    vand 9, 9, 3
-; BE-NEXT:    vslb 1, 1, 2
-; BE-NEXT:    vsrb 8, 0, 2
-; BE-NEXT:    vand 0, 0, 3
-; BE-NEXT:    vor 9, 9, 1
-; BE-NEXT:    lvx 1, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI8_3@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI8_3@toc@l
-; BE-NEXT:    lvx 15, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI8_2@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI8_2@toc@l
-; BE-NEXT:    vand 8, 8, 3
-; BE-NEXT:    vslb 0, 0, 2
-; BE-NEXT:    vor 8, 8, 0
-; BE-NEXT:    vspltisb 0, 1
-; BE-NEXT:    vsrb 11, 9, 0
-; BE-NEXT:    vand 9, 9, 1
-; BE-NEXT:    vaddubm 9, 9, 9
-; BE-NEXT:    vand 11, 11, 1
-; BE-NEXT:    vsrb 10, 8, 0
-; BE-NEXT:    vand 8, 8, 1
-; BE-NEXT:    vaddubm 8, 8, 8
-; BE-NEXT:    vor 9, 11, 9
-; BE-NEXT:    vslb 6, 4, 4
-; BE-NEXT:    vslb 7, 7, 7
-; BE-NEXT:    vand 10, 10, 1
-; BE-NEXT:    vand 14, 9, 13
-; BE-NEXT:    vaddubm 13, 13, 13
-; BE-NEXT:    vor 8, 10, 8
-; BE-NEXT:    vand 10, 9, 2
-; BE-NEXT:    vand 11, 9, 0
-; BE-NEXT:    vand 12, 9, 4
-; BE-NEXT:    vand 13, 9, 13
-; BE-NEXT:    vand 15, 9, 15
-; BE-NEXT:    vand 6, 9, 6
-; BE-NEXT:    vand 7, 9, 7
-; BE-NEXT:    vmuloub 9, 8, 10
-; BE-NEXT:    vmuleub 10, 8, 10
-; BE-NEXT:    vmuloub 16, 8, 11
-; BE-NEXT:    vmuleub 11, 8, 11
-; BE-NEXT:    vmuloub 17, 8, 12
-; BE-NEXT:    vmuleub 12, 8, 12
-; BE-NEXT:    vmuloub 18, 8, 14
-; BE-NEXT:    vmuleub 14, 8, 14
-; BE-NEXT:    vmuloub 19, 8, 13
-; BE-NEXT:    vmuleub 13, 8, 13
-; BE-NEXT:    vmuloub 31, 8, 15
-; BE-NEXT:    vmuleub 15, 8, 15
-; BE-NEXT:    vmuloub 30, 8, 6
-; BE-NEXT:    vmuleub 6, 8, 6
-; BE-NEXT:    vmuloub 29, 8, 7
-; BE-NEXT:    vmuleub 7, 8, 7
-; BE-NEXT:    lvx 8, 0, 3
-; BE-NEXT:    li 3, -16
-; BE-NEXT:    vperm 9, 10, 9, 8
-; BE-NEXT:    vperm 10, 11, 16, 8
-; BE-NEXT:    vperm 11, 12, 17, 8
-; BE-NEXT:    vperm 12, 14, 18, 8
-; BE-NEXT:    vperm 13, 13, 19, 8
-; BE-NEXT:    vperm 14, 15, 31, 8
-; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, -32
-; BE-NEXT:    vperm 6, 6, 30, 8
-; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, -48
-; BE-NEXT:    vperm 7, 7, 29, 8
-; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    vxor 8, 10, 9
-; BE-NEXT:    vxor 8, 8, 11
-; BE-NEXT:    vxor 8, 8, 12
-; BE-NEXT:    vxor 8, 8, 13
-; BE-NEXT:    vxor 8, 8, 14
-; BE-NEXT:    vxor 6, 8, 6
-; BE-NEXT:    vxor 6, 6, 7
-; BE-NEXT:    vand 5, 6, 5
-; BE-NEXT:    vsrb 7, 6, 4
-; BE-NEXT:    vslb 4, 5, 4
-; BE-NEXT:    vor 4, 7, 4
-; BE-NEXT:    vand 5, 4, 3
-; BE-NEXT:    vsrb 4, 4, 2
-; BE-NEXT:    vslb 2, 5, 2
-; BE-NEXT:    vand 3, 4, 3
-; BE-NEXT:    vor 2, 3, 2
-; BE-NEXT:    vsrb 3, 2, 0
-; BE-NEXT:    vand 2, 2, 1
-; BE-NEXT:    vaddubm 2, 2, 2
-; BE-NEXT:    vand 3, 3, 1
-; BE-NEXT:    vor 2, 3, 2
-; BE-NEXT:    vsrb 2, 2, 0
-; BE-NEXT:    blr
-;
-; LE-LABEL: clmulh_v16i8:
-; LE:       # %bb.0:
-; LE-NEXT:    addis 3, 2, .LCPI8_0@toc@ha
-; LE-NEXT:    vspltisb 4, 4
-; LE-NEXT:    vspltisb 5, 2
-; LE-NEXT:    addi 3, 3, .LCPI8_0@toc@l
-; LE-NEXT:    vslb 1, 3, 4
-; LE-NEXT:    vsrb 3, 3, 4
-; LE-NEXT:    vslb 6, 2, 4
-; LE-NEXT:    vsrb 2, 2, 4
-; LE-NEXT:    lxvd2x 0, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI8_1@toc@ha
-; LE-NEXT:    xxlor 35, 35, 33
-; LE-NEXT:    xxlor 34, 34, 38
-; LE-NEXT:    vspltisb 0, 1
-; LE-NEXT:    addi 3, 3, .LCPI8_1@toc@l
-; LE-NEXT:    vsrb 1, 3, 5
-; LE-NEXT:    vsrb 7, 2, 5
-; LE-NEXT:    vspltisb 6, 8
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI8_2@toc@ha
-; LE-NEXT:    xxland 35, 35, 0
-; LE-NEXT:    xxland 34, 34, 0
-; LE-NEXT:    xxland 2, 33, 0
-; LE-NEXT:    xxland 3, 39, 0
-; LE-NEXT:    addi 3, 3, .LCPI8_2@toc@l
-; LE-NEXT:    vslb 3, 3, 5
-; LE-NEXT:    vslb 2, 2, 5
-; LE-NEXT:    xxlor 35, 2, 35
-; LE-NEXT:    xxlor 34, 3, 34
-; LE-NEXT:    lxvd2x 3, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI8_3@toc@ha
-; LE-NEXT:    vsrb 1, 3, 0
-; LE-NEXT:    xxland 35, 35, 1
-; LE-NEXT:    vsrb 7, 2, 0
-; LE-NEXT:    xxland 34, 34, 1
-; LE-NEXT:    addi 3, 3, .LCPI8_3@toc@l
-; LE-NEXT:    xxland 2, 33, 1
-; LE-NEXT:    vaddubm 3, 3, 3
-; LE-NEXT:    vaddubm 2, 2, 2
-; LE-NEXT:    xxlor 2, 2, 35
-; LE-NEXT:    xxland 35, 2, 37
-; LE-NEXT:    xxswapd 33, 3
-; LE-NEXT:    xxland 3, 39, 1
-; LE-NEXT:    xxlor 34, 3, 34
-; LE-NEXT:    lxvd2x 3, 0, 3
-; LE-NEXT:    vmuloub 7, 2, 3
-; LE-NEXT:    vmuleub 3, 2, 3
-; LE-NEXT:    vperm 3, 3, 7, 1
-; LE-NEXT:    xxland 39, 2, 32
-; LE-NEXT:    vmuloub 8, 2, 7
-; LE-NEXT:    vmuleub 7, 2, 7
-; LE-NEXT:    vperm 7, 7, 8, 1
-; LE-NEXT:    xxland 40, 2, 36
-; LE-NEXT:    vmuloub 9, 2, 8
-; LE-NEXT:    vmuleub 8, 2, 8
-; LE-NEXT:    vperm 8, 8, 9, 1
-; LE-NEXT:    xxland 41, 2, 38
-; LE-NEXT:    vaddubm 6, 6, 6
-; LE-NEXT:    vmuloub 10, 2, 9
-; LE-NEXT:    vmuleub 9, 2, 9
-; LE-NEXT:    xxland 38, 2, 38
-; LE-NEXT:    vperm 9, 9, 10, 1
-; LE-NEXT:    vmuloub 10, 2, 6
-; LE-NEXT:    vmuleub 6, 2, 6
-; LE-NEXT:    vperm 6, 6, 10, 1
-; LE-NEXT:    xxland 42, 2, 3
-; LE-NEXT:    vmuloub 11, 2, 10
-; LE-NEXT:    vmuleub 10, 2, 10
-; LE-NEXT:    vperm 10, 10, 11, 1
-; LE-NEXT:    vslb 11, 4, 4
-; LE-NEXT:    xxland 43, 2, 43
-; LE-NEXT:    vmuloub 12, 2, 11
-; LE-NEXT:    vmuleub 11, 2, 11
-; LE-NEXT:    vperm 11, 11, 12, 1
-; LE-NEXT:    xxleqv 44, 44, 44
-; LE-NEXT:    vslb 12, 12, 12
-; LE-NEXT:    xxland 44, 2, 44
-; LE-NEXT:    xxlxor 2, 39, 35
-; LE-NEXT:    xxlxor 2, 2, 40
-; LE-NEXT:    vmuloub 13, 2, 12
-; LE-NEXT:    vmuleub 2, 2, 12
-; LE-NEXT:    xxlxor 2, 2, 41
-; LE-NEXT:    xxlxor 2, 2, 38
-; LE-NEXT:    xxlxor 2, 2, 42
-; LE-NEXT:    xxlxor 2, 2, 43
-; LE-NEXT:    vperm 2, 2, 13, 1
-; LE-NEXT:    xxlxor 34, 2, 34
-; LE-NEXT:    vslb 3, 2, 4
-; LE-NEXT:    vsrb 2, 2, 4
-; LE-NEXT:    xxlor 34, 34, 35
-; LE-NEXT:    xxland 35, 34, 0
-; LE-NEXT:    vsrb 2, 2, 5
-; LE-NEXT:    vslb 3, 3, 5
-; LE-NEXT:    xxland 0, 34, 0
-; LE-NEXT:    xxlor 34, 0, 35
-; LE-NEXT:    vsrb 3, 2, 0
-; LE-NEXT:    xxland 34, 34, 1
-; LE-NEXT:    xxland 0, 35, 1
-; LE-NEXT:    vaddubm 2, 2, 2
-; LE-NEXT:    xxlor 34, 0, 34
-; LE-NEXT:    vsrb 2, 2, 0
-; LE-NEXT:    blr
-  %a.ext = zext <16 x i8> %a to <16 x i16>
-  %b.ext = zext <16 x i8> %b to <16 x i16>
-  %clmul = call <16 x i16> @llvm.clmul.v16i16(<16 x i16> %a.ext, <16 x i16> %b.ext)
-  %res.ext = lshr <16 x i16> %clmul, splat (i16 8)
-  %res = trunc <16 x i16> %res.ext to <16 x i8>
-  ret <16 x i8> %res
-}
-
-define <8 x i16> @clmulh_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
-; BE-LABEL: clmulh_v8i16:
-; BE:       # %bb.0:
-; BE-NEXT:    li 3, -80
-; BE-NEXT:    vspltish 4, 8
-; BE-NEXT:    vxor 5, 5, 5
-; BE-NEXT:    stvx 27, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, -64
-; BE-NEXT:    vadduhm 19, 4, 4
-; BE-NEXT:    vspltisb 1, -1
-; BE-NEXT:    stvx 28, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, -48
-; BE-NEXT:    vspltish 0, 2
-; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, -32
-; BE-NEXT:    vrlh 8, 2, 4
-; BE-NEXT:    vspltish 2, 4
-; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, -16
-; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    addis 3, 2, .LCPI9_0@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI9_0@toc@l
-; BE-NEXT:    vrlh 6, 3, 4
-; BE-NEXT:    vspltish 3, 1
-; BE-NEXT:    vslh 13, 1, 1
-; BE-NEXT:    vspltisb 1, 15
-; BE-NEXT:    vand 14, 8, 1
-; BE-NEXT:    vsrh 8, 8, 2
-; BE-NEXT:    vand 15, 6, 1
-; BE-NEXT:    vsrh 6, 6, 2
-; BE-NEXT:    vslh 14, 14, 2
-; BE-NEXT:    vand 8, 8, 1
-; BE-NEXT:    vslh 15, 15, 2
-; BE-NEXT:    vand 6, 6, 1
-; BE-NEXT:    vor 8, 8, 14
-; BE-NEXT:    vor 14, 6, 15
-; BE-NEXT:    lvx 6, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI9_1@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI9_1@toc@l
-; BE-NEXT:    vand 15, 8, 6
-; BE-NEXT:    vsrh 8, 8, 0
-; BE-NEXT:    vslh 15, 15, 0
-; BE-NEXT:    vand 8, 8, 6
-; BE-NEXT:    vor 15, 8, 15
-; BE-NEXT:    lvx 8, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI9_2@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI9_2@toc@l
-; BE-NEXT:    lvx 31, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI9_3@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI9_3@toc@l
-; BE-NEXT:    lvx 30, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI9_4@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI9_4@toc@l
-; BE-NEXT:    vand 16, 14, 6
-; BE-NEXT:    lvx 29, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI9_5@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI9_5@toc@l
-; BE-NEXT:    lvx 28, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI9_6@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI9_6@toc@l
-; BE-NEXT:    lvx 27, 0, 3
-; BE-NEXT:    li 3, -16
-; BE-NEXT:    vsrh 14, 14, 0
-; BE-NEXT:    vslh 16, 16, 0
-; BE-NEXT:    vand 14, 14, 6
-; BE-NEXT:    vor 14, 14, 16
-; BE-NEXT:    vsrh 17, 14, 3
-; BE-NEXT:    vand 14, 14, 8
-; BE-NEXT:    vadduhm 14, 14, 14
-; BE-NEXT:    vsrh 16, 15, 3
-; BE-NEXT:    vand 15, 15, 8
-; BE-NEXT:    vadduhm 15, 15, 15
-; BE-NEXT:    vand 17, 17, 8
-; BE-NEXT:    vand 16, 16, 8
-; BE-NEXT:    vor 14, 17, 14
-; BE-NEXT:    vslh 7, 2, 2
-; BE-NEXT:    vsldoi 9, 3, 3, 1
-; BE-NEXT:    vsldoi 10, 0, 0, 1
-; BE-NEXT:    vsldoi 11, 2, 2, 1
-; BE-NEXT:    vslh 12, 4, 4
-; BE-NEXT:    vor 15, 16, 15
-; BE-NEXT:    vand 16, 14, 0
-; BE-NEXT:    vand 17, 14, 3
-; BE-NEXT:    vand 18, 14, 2
-; BE-NEXT:    vand 19, 14, 19
-; BE-NEXT:    vand 31, 14, 31
-; BE-NEXT:    vand 7, 14, 7
-; BE-NEXT:    vand 30, 14, 30
-; BE-NEXT:    vand 9, 14, 9
-; BE-NEXT:    vand 10, 14, 10
-; BE-NEXT:    vand 11, 14, 11
-; BE-NEXT:    vand 12, 14, 12
-; BE-NEXT:    vand 29, 14, 29
-; BE-NEXT:    vand 28, 14, 28
-; BE-NEXT:    vand 27, 14, 27
-; BE-NEXT:    vand 13, 14, 13
-; BE-NEXT:    vand 14, 14, 4
-; BE-NEXT:    vmladduhm 16, 15, 16, 5
-; BE-NEXT:    vmladduhm 17, 15, 17, 5
-; BE-NEXT:    vmladduhm 18, 15, 18, 5
-; BE-NEXT:    vmladduhm 14, 15, 14, 5
-; BE-NEXT:    vmladduhm 19, 15, 19, 5
-; BE-NEXT:    vmladduhm 31, 15, 31, 5
-; BE-NEXT:    vmladduhm 7, 15, 7, 5
-; BE-NEXT:    vmladduhm 30, 15, 30, 5
-; BE-NEXT:    vmladduhm 9, 15, 9, 5
-; BE-NEXT:    vmladduhm 10, 15, 10, 5
-; BE-NEXT:    vmladduhm 11, 15, 11, 5
-; BE-NEXT:    vmladduhm 12, 15, 12, 5
-; BE-NEXT:    vmladduhm 29, 15, 29, 5
-; BE-NEXT:    vmladduhm 28, 15, 28, 5
-; BE-NEXT:    vmladduhm 27, 15, 27, 5
-; BE-NEXT:    vmladduhm 5, 15, 13, 5
-; BE-NEXT:    vxor 13, 17, 16
-; BE-NEXT:    vxor 13, 13, 18
-; BE-NEXT:    vxor 13, 13, 14
-; BE-NEXT:    vxor 13, 13, 19
-; BE-NEXT:    vxor 13, 13, 31
-; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, -32
-; BE-NEXT:    vxor 7, 13, 7
-; BE-NEXT:    vxor 7, 7, 30
-; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, -48
-; BE-NEXT:    vxor 7, 7, 9
-; BE-NEXT:    vxor 7, 7, 10
-; BE-NEXT:    vxor 7, 7, 11
-; BE-NEXT:    vxor 7, 7, 12
-; BE-NEXT:    vxor 7, 7, 29
-; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, -64
-; BE-NEXT:    vxor 7, 7, 28
-; BE-NEXT:    lvx 28, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, -80
-; BE-NEXT:    vxor 7, 7, 27
-; BE-NEXT:    lvx 27, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    vxor 5, 7, 5
-; BE-NEXT:    vrlh 4, 5, 4
-; BE-NEXT:    vand 5, 4, 1
-; BE-NEXT:    vsrh 4, 4, 2
-; BE-NEXT:    vslh 2, 5, 2
-; BE-NEXT:    vand 4, 4, 1
-; BE-NEXT:    vor 2, 4, 2
-; BE-NEXT:    vand 4, 2, 6
-; BE-NEXT:    vsrh 2, 2, 0
-; BE-NEXT:    vslh 4, 4, 0
-; BE-NEXT:    vand 2, 2, 6
-; BE-NEXT:    vor 2, 2, 4
-; BE-NEXT:    vsrh 4, 2, 3
-; BE-NEXT:    vand 2, 2, 8
-; BE-NEXT:    vadduhm 2, 2, 2
-; BE-NEXT:    vand 4, 4, 8
-; BE-NEXT:    vor 2, 4, 2
-; BE-NEXT:    vsrh 2, 2, 3
-; BE-NEXT:    blr
-;
-; LE-LABEL: clmulh_v8i16:
-; LE:       # %bb.0:
-; LE-NEXT:    vspltish 5, 8
-; LE-NEXT:    vspltisb 4, 15
-; LE-NEXT:    addis 3, 2, .LCPI9_0@toc@ha
-; LE-NEXT:    vrlh 2, 2, 5
-; LE-NEXT:    vspltish 0, 4
-; LE-NEXT:    addi 3, 3, .LCPI9_0@toc@l
-; LE-NEXT:    vspltish 6, 2
-; LE-NEXT:    vspltish 1, 1
-; LE-NEXT:    vrlh 3, 3, 5
-; LE-NEXT:    xxland 42, 34, 36
-; LE-NEXT:    vsrh 2, 2, 0
-; LE-NEXT:    vslh 10, 10, 0
-; LE-NEXT:    xxland 0, 34, 36
-; LE-NEXT:    vsldoi 7, 1, 1, 1
-; LE-NEXT:    vsldoi 8, 6, 6, 1
-; LE-NEXT:    vsldoi 9, 0, 0, 1
-; LE-NEXT:    xxlor 34, 0, 42
-; LE-NEXT:    lxvd2x 0, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI9_1@toc@ha
-; LE-NEXT:    addi 3, 3, .LCPI9_1@toc@l
-; LE-NEXT:    xxland 42, 34, 0
-; LE-NEXT:    vsrh 2, 2, 6
-; LE-NEXT:    vslh 10, 10, 6
-; LE-NEXT:    xxland 1, 34, 0
-; LE-NEXT:    xxlor 34, 1, 42
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI9_2@toc@ha
-; LE-NEXT:    vsrh 10, 2, 1
-; LE-NEXT:    addi 3, 3, .LCPI9_2@toc@l
-; LE-NEXT:    lxvd2x 4, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI9_3@toc@ha
-; LE-NEXT:    xxland 34, 34, 1
-; LE-NEXT:    xxland 2, 42, 1
-; LE-NEXT:    xxland 42, 35, 36
-; LE-NEXT:    vsrh 3, 3, 0
-; LE-NEXT:    addi 3, 3, .LCPI9_3@toc@l
-; LE-NEXT:    vadduhm 2, 2, 2
-; LE-NEXT:    vslh 10, 10, 0
-; LE-NEXT:    xxlor 34, 2, 34
-; LE-NEXT:    xxland 2, 35, 36
-; LE-NEXT:    xxlor 35, 2, 42
-; LE-NEXT:    xxland 42, 35, 0
-; LE-NEXT:    vsrh 3, 3, 6
-; LE-NEXT:    vslh 10, 10, 6
-; LE-NEXT:    xxland 2, 35, 0
-; LE-NEXT:    xxlor 35, 2, 42
-; LE-NEXT:    vsrh 10, 3, 1
-; LE-NEXT:    xxland 35, 35, 1
-; LE-NEXT:    xxland 2, 42, 1
-; LE-NEXT:    vadduhm 3, 3, 3
-; LE-NEXT:    xxlor 2, 2, 35
-; LE-NEXT:    vxor 3, 3, 3
-; LE-NEXT:    xxland 42, 2, 38
-; LE-NEXT:    xxland 43, 2, 33
-; LE-NEXT:    xxland 39, 2, 39
-; LE-NEXT:    vmladduhm 10, 2, 10, 3
-; LE-NEXT:    vmladduhm 11, 2, 11, 3
-; LE-NEXT:    vmladduhm 7, 2, 7, 3
-; LE-NEXT:    xxlxor 3, 43, 42
-; LE-NEXT:    xxland 42, 2, 32
-; LE-NEXT:    vmladduhm 10, 2, 10, 3
-; LE-NEXT:    xxlxor 3, 3, 42
-; LE-NEXT:    xxland 42, 2, 37
-; LE-NEXT:    vmladduhm 10, 2, 10, 3
-; LE-NEXT:    xxlxor 3, 3, 42
-; LE-NEXT:    vadduhm 10, 5, 5
-; LE-NEXT:    xxland 42, 2, 42
-; LE-NEXT:    vmladduhm 10, 2, 10, 3
-; LE-NEXT:    xxlxor 3, 3, 42
-; LE-NEXT:    xxland 42, 2, 4
-; LE-NEXT:    lxvd2x 4, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI9_4@toc@ha
-; LE-NEXT:    vmladduhm 10, 2, 10, 3
-; LE-NEXT:    addi 3, 3, .LCPI9_4@toc@l
-; LE-NEXT:    xxlxor 3, 3, 42
-; LE-NEXT:    vslh 10, 0, 0
-; LE-NEXT:    xxland 42, 2, 42
-; LE-NEXT:    vmladduhm 10, 2, 10, 3
-; LE-NEXT:    xxlxor 3, 3, 42
-; LE-NEXT:    xxland 42, 2, 4
-; LE-NEXT:    lxvd2x 4, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI9_5@toc@ha
-; LE-NEXT:    vmladduhm 10, 2, 10, 3
-; LE-NEXT:    addi 3, 3, .LCPI9_5@toc@l
-; LE-NEXT:    xxlxor 3, 3, 42
-; LE-NEXT:    xxlxor 3, 3, 39
-; LE-NEXT:    xxland 39, 2, 40
-; LE-NEXT:    vmladduhm 7, 2, 7, 3
-; LE-NEXT:    xxlxor 3, 3, 39
-; LE-NEXT:    xxland 39, 2, 41
-; LE-NEXT:    vmladduhm 7, 2, 7, 3
-; LE-NEXT:    xxlxor 3, 3, 39
-; LE-NEXT:    vslh 7, 5, 5
-; LE-NEXT:    xxland 39, 2, 39
-; LE-NEXT:    vmladduhm 7, 2, 7, 3
-; LE-NEXT:    xxlxor 3, 3, 39
-; LE-NEXT:    xxland 39, 2, 4
-; LE-NEXT:    lxvd2x 4, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI9_6@toc@ha
-; LE-NEXT:    vmladduhm 7, 2, 7, 3
-; LE-NEXT:    addi 3, 3, .LCPI9_6@toc@l
-; LE-NEXT:    xxlxor 3, 3, 39
-; LE-NEXT:    xxland 39, 2, 4
-; LE-NEXT:    lxvd2x 4, 0, 3
-; LE-NEXT:    vmladduhm 7, 2, 7, 3
-; LE-NEXT:    xxlxor 3, 3, 39
-; LE-NEXT:    xxland 39, 2, 4
-; LE-NEXT:    vmladduhm 7, 2, 7, 3
-; LE-NEXT:    xxlxor 3, 3, 39
-; LE-NEXT:    xxleqv 39, 39, 39
-; LE-NEXT:    vslh 7, 7, 7
-; LE-NEXT:    xxland 39, 2, 39
-; LE-NEXT:    vmladduhm 2, 2, 7, 3
-; LE-NEXT:    xxlxor 34, 3, 34
-; LE-NEXT:    vrlh 2, 2, 5
-; LE-NEXT:    xxland 35, 34, 36
-; LE-NEXT:    vsrh 2, 2, 0
-; LE-NEXT:    vslh 3, 3, 0
-; LE-NEXT:    xxland 2, 34, 36
-; LE-NEXT:    xxlor 34, 2, 35
-; LE-NEXT:    xxland 35, 34, 0
-; LE-NEXT:    vsrh 2, 2, 6
-; LE-NEXT:    vslh 3, 3, 6
-; LE-NEXT:    xxland 0, 34, 0
-; LE-NEXT:    xxlor 34, 0, 35
-; LE-NEXT:    vsrh 3, 2, 1
-; LE-NEXT:    xxland 34, 34, 1
-; LE-NEXT:    xxland 0, 35, 1
-; LE-NEXT:    vadduhm 2, 2, 2
-; LE-NEXT:    xxlor 34, 0, 34
-; LE-NEXT:    vsrh 2, 2, 1
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 4, 4
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 3, 3
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 2, 2
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 1, 1
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 0, 0
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 32, 32
+; LE-NEXT:    rldicl 4, 4, 32, 31
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 31, 33
+; LE-NEXT:    rldicl 4, 4, 33, 30
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 30, 34
+; LE-NEXT:    rldicl 4, 4, 34, 29
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 29, 35
+; LE-NEXT:    rldicl 4, 4, 35, 28
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 28, 36
+; LE-NEXT:    rldicl 4, 4, 36, 27
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 27, 37
+; LE-NEXT:    rldicl 4, 4, 37, 26
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 26, 38
+; LE-NEXT:    rldicl 4, 4, 38, 25
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 25, 39
+; LE-NEXT:    rldicl 4, 4, 39, 24
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 24, 40
+; LE-NEXT:    rldicl 4, 4, 40, 23
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 23, 41
+; LE-NEXT:    rldicl 4, 4, 41, 22
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 22, 42
+; LE-NEXT:    rldicl 4, 4, 42, 21
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 21, 43
+; LE-NEXT:    rldicl 4, 4, 43, 20
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 20, 44
+; LE-NEXT:    rldicl 4, 4, 44, 19
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 19, 45
+; LE-NEXT:    rldicl 4, 4, 45, 18
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 18, 46
+; LE-NEXT:    rldicl 4, 4, 46, 17
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 17, 47
+; LE-NEXT:    rldicl 4, 4, 47, 16
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 16, 48
+; LE-NEXT:    rldicl 4, 4, 48, 15
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 15, 49
+; LE-NEXT:    rldicl 4, 4, 49, 14
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 14, 50
+; LE-NEXT:    rldicl 4, 4, 50, 13
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 13, 51
+; LE-NEXT:    rldicl 4, 4, 51, 12
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 12, 52
+; LE-NEXT:    rldicl 4, 4, 52, 11
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 11, 53
+; LE-NEXT:    rldicl 4, 4, 53, 10
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 10, 54
+; LE-NEXT:    rldicl 4, 4, 54, 9
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 9, 55
+; LE-NEXT:    rldicl 4, 4, 55, 8
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 8, 56
+; LE-NEXT:    rldicl 4, 4, 56, 7
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 7, 57
+; LE-NEXT:    rldicl 4, 4, 57, 6
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 6, 58
+; LE-NEXT:    rldicl 4, 4, 58, 5
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 5, 59
+; LE-NEXT:    rldicl 4, 4, 59, 4
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 4, 60
+; LE-NEXT:    rldicl 4, 4, 60, 3
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 3, 61
+; LE-NEXT:    rldicl 4, 4, 61, 2
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 2, 62
+; LE-NEXT:    rldicl 4, 4, 62, 1
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicr 4, 9, 0, 0
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 1
+; LE-NEXT:    rldicl 3, 3, 63, 1
+; LE-NEXT:    and 4, 4, 7
+; LE-NEXT:    and 3, 3, 6
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 2
+; LE-NEXT:    rldicl 3, 3, 62, 2
+; LE-NEXT:    and 4, 4, 11
+; LE-NEXT:    and 3, 3, 8
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 4
+; LE-NEXT:    rldicl 3, 3, 60, 4
+; LE-NEXT:    and 4, 4, 0
+; LE-NEXT:    and 3, 3, 12
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    rldicl 4, 3, 32, 32
+; LE-NEXT:    rotlwi 5, 4, 24
+; LE-NEXT:    rlwimi 5, 4, 8, 8, 15
+; LE-NEXT:    rlwimi 5, 4, 8, 24, 31
+; LE-NEXT:    rotlwi 4, 3, 24
+; LE-NEXT:    rlwimi 4, 3, 8, 8, 15
+; LE-NEXT:    rlwimi 4, 3, 8, 24, 31
+; LE-NEXT:    sldi 3, 4, 32
+; LE-NEXT:    or 3, 3, 5
+; LE-NEXT:    mtfprd 1, 3
+; LE-NEXT:    xxmrghd 34, 1, 0
+; LE-NEXT:    addi 1, 1, 752
 ; LE-NEXT:    blr
-  %a.ext = zext <8 x i16> %a to <8 x i32>
-  %b.ext = zext <8 x i16> %b to <8 x i32>
-  %clmul = call <8 x i32> @llvm.clmul.v8i32(<8 x i32> %a.ext, <8 x i32> %b.ext)
-  %res.ext = lshr <8 x i32> %clmul, splat (i32 16)
-  %res = trunc <8 x i32> %res.ext to <8 x i16>
-  ret <8 x i16> %res
-}
-
-define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
-; BE-LABEL: clmulh_v4i32:
-; BE:       # %bb.0:
-; BE-NEXT:    stdu 1, -1472(1)
-; BE-NEXT:    li 3, 1280
-; BE-NEXT:    vspltisb 12, -1
-; BE-NEXT:    stvx 20, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1296
-; BE-NEXT:    vslw 15, 12, 12
-; BE-NEXT:    vspltisw 12, 12
-; BE-NEXT:    stvx 21, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1312
-; BE-NEXT:    vadduwm 17, 12, 12
-; BE-NEXT:    vspltisw 18, 8
-; BE-NEXT:    stvx 22, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1328
-; BE-NEXT:    vsrw 6, 2, 18
-; BE-NEXT:    vspltisw 19, 4
-; BE-NEXT:    stvx 23, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1344
-; BE-NEXT:    stvx 24, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1360
-; BE-NEXT:    stvx 25, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1376
-; BE-NEXT:    vsrw 9, 3, 18
-; BE-NEXT:    stvx 26, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1392
-; BE-NEXT:    stvx 27, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1408
-; BE-NEXT:    stvx 28, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1424
-; BE-NEXT:    vsrw 12, 2, 17
-; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1440
-; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1456
-; BE-NEXT:    vspltisw 30, 2
-; BE-NEXT:    vslw 14, 2, 17
-; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1264
-; BE-NEXT:    vspltisw 31, 1
-; BE-NEXT:    stvx 17, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    addis 3, 2, .LCPI10_0@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI10_0@toc@l
-; BE-NEXT:    lvx 29, 0, 3
-; BE-NEXT:    li 3, 1248
-; BE-NEXT:    vsrw 16, 3, 17
-; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1232
-; BE-NEXT:    vslw 17, 3, 17
-; BE-NEXT:    vand 2, 2, 29
-; BE-NEXT:    vand 3, 3, 29
-; BE-NEXT:    vand 6, 6, 29
-; BE-NEXT:    vand 9, 9, 29
-; BE-NEXT:    vslw 2, 2, 18
-; BE-NEXT:    vslw 3, 3, 18
-; BE-NEXT:    vor 6, 6, 12
-; BE-NEXT:    vspltisb 12, 15
-; BE-NEXT:    stvx 12, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    addis 3, 2, .LCPI10_1@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI10_1@toc@l
-; BE-NEXT:    vor 9, 9, 16
-; BE-NEXT:    vor 2, 14, 2
-; BE-NEXT:    vor 3, 17, 3
-; BE-NEXT:    vor 2, 2, 6
-; BE-NEXT:    vor 3, 3, 9
-; BE-NEXT:    vand 6, 2, 12
-; BE-NEXT:    vsrw 2, 2, 19
-; BE-NEXT:    vand 9, 3, 12
-; BE-NEXT:    vsrw 3, 3, 19
-; BE-NEXT:    vand 2, 2, 12
-; BE-NEXT:    vand 3, 3, 12
-; BE-NEXT:    lvx 12, 0, 3
-; BE-NEXT:    li 3, 1216
-; BE-NEXT:    stvx 12, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    addis 3, 2, .LCPI10_2@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI10_2@toc@l
-; BE-NEXT:    vslw 6, 6, 19
-; BE-NEXT:    vslw 9, 9, 19
-; BE-NEXT:    vor 2, 2, 6
-; BE-NEXT:    vor 3, 3, 9
-; BE-NEXT:    vand 6, 2, 12
-; BE-NEXT:    vsrw 2, 2, 30
-; BE-NEXT:    vand 9, 3, 12
-; BE-NEXT:    vsrw 3, 3, 30
-; BE-NEXT:    vand 2, 2, 12
-; BE-NEXT:    vand 3, 3, 12
-; BE-NEXT:    lvx 12, 0, 3
-; BE-NEXT:    li 3, 1200
-; BE-NEXT:    stvx 12, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1136
-; BE-NEXT:    stvx 18, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    addis 3, 2, .LCPI10_3@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI10_3@toc@l
-; BE-NEXT:    vslw 6, 6, 30
-; BE-NEXT:    vslw 9, 9, 30
-; BE-NEXT:    vor 2, 2, 6
-; BE-NEXT:    vor 3, 3, 9
-; BE-NEXT:    vsrw 6, 2, 31
-; BE-NEXT:    vand 2, 2, 12
-; BE-NEXT:    vadduwm 2, 2, 2
-; BE-NEXT:    vsrw 9, 3, 31
-; BE-NEXT:    vand 3, 3, 12
-; BE-NEXT:    vand 6, 6, 12
-; BE-NEXT:    vand 12, 9, 12
-; BE-NEXT:    vor 9, 6, 2
-; BE-NEXT:    vadduwm 2, 3, 3
-; BE-NEXT:    vor 14, 12, 2
-; BE-NEXT:    vadduwm 2, 18, 18
-; BE-NEXT:    vand 28, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI10_4@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI10_4@toc@l
-; BE-NEXT:    vand 27, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI10_5@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI10_5@toc@l
-; BE-NEXT:    vand 25, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI10_6@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI10_6@toc@l
-; BE-NEXT:    vslw 4, 19, 19
-; BE-NEXT:    vand 26, 14, 4
-; BE-NEXT:    vand 4, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI10_7@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI10_7@toc@l
-; BE-NEXT:    vsldoi 5, 31, 31, 1
-; BE-NEXT:    vand 24, 14, 5
-; BE-NEXT:    vand 5, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI10_8@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI10_8@toc@l
-; BE-NEXT:    vand 29, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI10_9@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI10_9@toc@l
-; BE-NEXT:    vand 21, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI10_10@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI10_10@toc@l
-; BE-NEXT:    vslw 7, 18, 18
-; BE-NEXT:    vand 3, 14, 7
-; BE-NEXT:    vand 7, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI10_11@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI10_11@toc@l
-; BE-NEXT:    vsldoi 13, 18, 18, 2
-; BE-NEXT:    vand 16, 14, 13
-; BE-NEXT:    vand 13, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    addis 3, 2, .LCPI10_12@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI10_12@toc@l
-; BE-NEXT:    vand 12, 14, 2
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    li 3, 1184
-; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1168
-; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1152
-; BE-NEXT:    vsldoi 11, 31, 31, 2
-; BE-NEXT:    stvx 19, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1072
-; BE-NEXT:    vsldoi 1, 19, 19, 1
-; BE-NEXT:    vsldoi 10, 30, 30, 2
-; BE-NEXT:    vand 20, 14, 11
-; BE-NEXT:    vand 11, 14, 2
-; BE-NEXT:    vsldoi 2, 31, 31, 3
-; BE-NEXT:    vsldoi 8, 19, 19, 2
-; BE-NEXT:    vand 22, 14, 1
-; BE-NEXT:    vand 1, 14, 10
-; BE-NEXT:    vand 10, 14, 2
-; BE-NEXT:    vsldoi 2, 30, 30, 3
-; BE-NEXT:    vand 17, 14, 8
-; BE-NEXT:    vand 8, 14, 2
-; BE-NEXT:    vsldoi 2, 19, 19, 3
-; BE-NEXT:    vand 2, 14, 2
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1040
-; BE-NEXT:    vsldoi 2, 18, 18, 3
-; BE-NEXT:    vand 2, 14, 2
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    addis 3, 2, .LCPI10_13@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI10_13@toc@l
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    li 3, 1008
-; BE-NEXT:    vand 2, 14, 2
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    addis 3, 2, .LCPI10_14@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI10_14@toc@l
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    li 3, 288
-; BE-NEXT:    vand 2, 14, 2
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    addis 3, 2, .LCPI10_15@toc@ha
-; BE-NEXT:    addi 3, 3, .LCPI10_15@toc@l
-; BE-NEXT:    lvx 2, 0, 3
-; BE-NEXT:    li 3, 192
-; BE-NEXT:    vand 2, 14, 2
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 272
-; BE-NEXT:    vand 2, 14, 15
-; BE-NEXT:    vspltisw 15, -16
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 976
-; BE-NEXT:    vand 2, 14, 30
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 944
-; BE-NEXT:    vand 31, 14, 31
+  %a.ext = zext <2 x i64> %a to <2 x i128>
+  %b.ext = zext <2 x i64> %b to <2 x i128>
+  %clmul = call <2 x i128> @llvm.clmul.v2i128(<2 x i128> %a.ext, <2 x i128> %b.ext)
+  %res.ext = lshr <2 x i128> %clmul, splat (i128 63)
+  %res = trunc <2 x i128> %res.ext to <2 x i64>
+  ret <2 x i64> %res
+}
+
+define <16 x i8> @clmulh_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+; BE-LABEL: clmulh_v16i8:
+; BE:       # %bb.0:
+; BE-NEXT:    li 3, -48
+; BE-NEXT:    vspltisb 4, 4
+; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -32
+; BE-NEXT:    vsrb 1, 3, 4
+; BE-NEXT:    vspltisb 5, 15
+; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -16
+; BE-NEXT:    vspltisb 7, -1
 ; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 912
-; BE-NEXT:    vsldoi 0, 30, 30, 1
-; BE-NEXT:    vand 19, 14, 19
-; BE-NEXT:    stvx 19, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 880
-; BE-NEXT:    vand 23, 14, 0
-; BE-NEXT:    vand 14, 14, 18
-; BE-NEXT:    stvx 14, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1120
-; BE-NEXT:    vxor 6, 6, 6
-; BE-NEXT:    vrlw 0, 2, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1104
-; BE-NEXT:    vrlw 0, 31, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1088
-; BE-NEXT:    vrlw 0, 19, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1056
-; BE-NEXT:    vrlw 0, 14, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1024
-; BE-NEXT:    vrlw 0, 28, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 992
-; BE-NEXT:    vrlw 0, 27, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 960
-; BE-NEXT:    vrlw 0, 26, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 928
-; BE-NEXT:    vrlw 0, 25, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 896
-; BE-NEXT:    vrlw 0, 24, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 864
-; BE-NEXT:    vrlw 0, 23, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 832
-; BE-NEXT:    vrlw 0, 22, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 800
-; BE-NEXT:    vrlw 0, 3, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 768
-; BE-NEXT:    vrlw 0, 4, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 736
-; BE-NEXT:    vrlw 0, 5, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 704
-; BE-NEXT:    vrlw 0, 29, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 672
-; BE-NEXT:    vrlw 0, 21, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 640
-; BE-NEXT:    vrlw 0, 20, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 592
-; BE-NEXT:    vrlw 0, 1, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 560
-; BE-NEXT:    vrlw 0, 17, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 528
-; BE-NEXT:    vrlw 0, 16, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 496
-; BE-NEXT:    vrlw 0, 7, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 464
-; BE-NEXT:    vrlw 0, 13, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 432
-; BE-NEXT:    vrlw 0, 12, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 400
-; BE-NEXT:    vrlw 0, 11, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 368
-; BE-NEXT:    vrlw 0, 10, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 336
-; BE-NEXT:    vrlw 0, 8, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1072
-; BE-NEXT:    vmr 14, 7
-; BE-NEXT:    lvx 7, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 304
-; BE-NEXT:    vrlw 0, 7, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1040
-; BE-NEXT:    vmr 30, 1
-; BE-NEXT:    lvx 1, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 240
-; BE-NEXT:    vrlw 0, 1, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1008
-; BE-NEXT:    vmr 19, 5
-; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 208
-; BE-NEXT:    vrlw 0, 5, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 288
-; BE-NEXT:    vmr 18, 4
-; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 160
-; BE-NEXT:    vrlw 0, 4, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 192
-; BE-NEXT:    vmr 31, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 128
-; BE-NEXT:    vrlw 0, 3, 15
-; BE-NEXT:    vmsumuhm 2, 9, 0, 6
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 272
-; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 64
-; BE-NEXT:    vrlw 0, 2, 15
-; BE-NEXT:    vmsumuhm 0, 9, 0, 6
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 976
-; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 96
-; BE-NEXT:    vmulouh 0, 9, 0
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 944
-; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 80
-; BE-NEXT:    vmulouh 0, 9, 0
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 912
-; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 112
-; BE-NEXT:    vmulouh 0, 9, 0
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 880
-; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 144
-; BE-NEXT:    vmulouh 0, 9, 0
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 176
-; BE-NEXT:    vmulouh 0, 9, 28
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 224
-; BE-NEXT:    vmulouh 0, 9, 27
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 256
-; BE-NEXT:    vmulouh 0, 9, 26
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 320
-; BE-NEXT:    vmulouh 0, 9, 25
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 352
-; BE-NEXT:    vmulouh 0, 9, 24
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 384
-; BE-NEXT:    vmulouh 0, 9, 23
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 416
-; BE-NEXT:    vmulouh 0, 9, 22
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 448
-; BE-NEXT:    vmulouh 0, 9, 31
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 480
-; BE-NEXT:    vmulouh 0, 9, 18
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 512
-; BE-NEXT:    vmulouh 0, 9, 19
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 544
-; BE-NEXT:    vmulouh 0, 9, 29
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 576
-; BE-NEXT:    vmulouh 0, 9, 21
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 608
-; BE-NEXT:    vmulouh 0, 9, 20
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 624
-; BE-NEXT:    vmulouh 0, 9, 30
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 656
-; BE-NEXT:    vmulouh 0, 9, 17
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 688
-; BE-NEXT:    vmulouh 0, 9, 16
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 720
-; BE-NEXT:    vmulouh 0, 9, 14
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 752
-; BE-NEXT:    vmulouh 0, 9, 13
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 784
-; BE-NEXT:    vmulouh 0, 9, 12
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 816
-; BE-NEXT:    vmulouh 0, 9, 11
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 848
-; BE-NEXT:    vmulouh 0, 9, 10
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 880
-; BE-NEXT:    vmulouh 0, 9, 8
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 912
-; BE-NEXT:    vmulouh 0, 9, 7
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 944
-; BE-NEXT:    vmulouh 0, 9, 1
-; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 976
-; BE-NEXT:    vmulouh 5, 9, 5
-; BE-NEXT:    stvx 5, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1008
-; BE-NEXT:    vmulouh 4, 9, 4
-; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1040
-; BE-NEXT:    vmulouh 3, 9, 3
-; BE-NEXT:    stvx 3, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1072
-; BE-NEXT:    vmulouh 2, 9, 2
-; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 1120
-; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1104
-; BE-NEXT:    vslw 9, 2, 15
-; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1088
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1056
-; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1024
-; BE-NEXT:    vslw 2, 2, 15
-; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 992
-; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 960
-; BE-NEXT:    lvx 1, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 928
-; BE-NEXT:    vslw 3, 3, 15
-; BE-NEXT:    lvx 6, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 896
-; BE-NEXT:    lvx 7, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 864
-; BE-NEXT:    lvx 8, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 832
-; BE-NEXT:    vslw 4, 4, 15
-; BE-NEXT:    lvx 10, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 800
-; BE-NEXT:    lvx 11, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 768
-; BE-NEXT:    vslw 5, 5, 15
-; BE-NEXT:    lvx 12, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 736
-; BE-NEXT:    lvx 13, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 704
-; BE-NEXT:    lvx 14, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 672
-; BE-NEXT:    vslw 0, 0, 15
-; BE-NEXT:    lvx 16, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 640
-; BE-NEXT:    lvx 17, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 592
-; BE-NEXT:    lvx 18, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 560
-; BE-NEXT:    vslw 1, 1, 15
-; BE-NEXT:    lvx 19, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 528
+; BE-NEXT:    addis 3, 2, .LCPI8_0@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI8_0@toc@l
+; BE-NEXT:    vand 3, 3, 5
+; BE-NEXT:    vspltisb 13, 8
+; BE-NEXT:    vslb 3, 3, 4
+; BE-NEXT:    vsrb 0, 2, 4
+; BE-NEXT:    vand 2, 2, 5
+; BE-NEXT:    vor 1, 1, 3
+; BE-NEXT:    lvx 3, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI8_1@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI8_1@toc@l
+; BE-NEXT:    vslb 2, 2, 4
+; BE-NEXT:    vor 0, 0, 2
+; BE-NEXT:    vspltisb 2, 2
+; BE-NEXT:    vsrb 9, 1, 2
+; BE-NEXT:    vand 1, 1, 3
+; BE-NEXT:    vand 9, 9, 3
+; BE-NEXT:    vslb 1, 1, 2
+; BE-NEXT:    vsrb 8, 0, 2
+; BE-NEXT:    vand 0, 0, 3
+; BE-NEXT:    vor 9, 9, 1
+; BE-NEXT:    lvx 1, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI8_3@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI8_3@toc@l
+; BE-NEXT:    lvx 15, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI8_2@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI8_2@toc@l
+; BE-NEXT:    vand 8, 8, 3
+; BE-NEXT:    vslb 0, 0, 2
+; BE-NEXT:    vor 8, 8, 0
+; BE-NEXT:    vspltisb 0, 1
+; BE-NEXT:    vsrb 11, 9, 0
+; BE-NEXT:    vand 9, 9, 1
+; BE-NEXT:    vaddubm 9, 9, 9
+; BE-NEXT:    vand 11, 11, 1
+; BE-NEXT:    vsrb 10, 8, 0
+; BE-NEXT:    vand 8, 8, 1
+; BE-NEXT:    vaddubm 8, 8, 8
+; BE-NEXT:    vor 9, 11, 9
+; BE-NEXT:    vslb 6, 4, 4
+; BE-NEXT:    vslb 7, 7, 7
+; BE-NEXT:    vand 10, 10, 1
+; BE-NEXT:    vand 14, 9, 13
+; BE-NEXT:    vaddubm 13, 13, 13
+; BE-NEXT:    vor 8, 10, 8
+; BE-NEXT:    vand 10, 9, 2
+; BE-NEXT:    vand 11, 9, 0
+; BE-NEXT:    vand 12, 9, 4
+; BE-NEXT:    vand 13, 9, 13
+; BE-NEXT:    vand 15, 9, 15
+; BE-NEXT:    vand 6, 9, 6
+; BE-NEXT:    vand 7, 9, 7
+; BE-NEXT:    vmuloub 9, 8, 10
+; BE-NEXT:    vmuleub 10, 8, 10
+; BE-NEXT:    vmuloub 16, 8, 11
+; BE-NEXT:    vmuleub 11, 8, 11
+; BE-NEXT:    vmuloub 17, 8, 12
+; BE-NEXT:    vmuleub 12, 8, 12
+; BE-NEXT:    vmuloub 18, 8, 14
+; BE-NEXT:    vmuleub 14, 8, 14
+; BE-NEXT:    vmuloub 19, 8, 13
+; BE-NEXT:    vmuleub 13, 8, 13
+; BE-NEXT:    vmuloub 31, 8, 15
+; BE-NEXT:    vmuleub 15, 8, 15
+; BE-NEXT:    vmuloub 30, 8, 6
+; BE-NEXT:    vmuleub 6, 8, 6
+; BE-NEXT:    vmuloub 29, 8, 7
+; BE-NEXT:    vmuleub 7, 8, 7
+; BE-NEXT:    lvx 8, 0, 3
+; BE-NEXT:    li 3, -16
+; BE-NEXT:    vperm 9, 10, 9, 8
+; BE-NEXT:    vperm 10, 11, 16, 8
+; BE-NEXT:    vperm 11, 12, 17, 8
+; BE-NEXT:    vperm 12, 14, 18, 8
+; BE-NEXT:    vperm 13, 13, 19, 8
+; BE-NEXT:    vperm 14, 15, 31, 8
 ; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 496
-; BE-NEXT:    vslw 6, 6, 15
+; BE-NEXT:    li 3, -32
+; BE-NEXT:    vperm 6, 6, 30, 8
 ; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 464
+; BE-NEXT:    li 3, -48
+; BE-NEXT:    vperm 7, 7, 29, 8
 ; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 432
+; BE-NEXT:    vxor 8, 10, 9
+; BE-NEXT:    vxor 8, 8, 11
+; BE-NEXT:    vxor 8, 8, 12
+; BE-NEXT:    vxor 8, 8, 13
+; BE-NEXT:    vxor 8, 8, 14
+; BE-NEXT:    vxor 6, 8, 6
+; BE-NEXT:    vxor 6, 6, 7
+; BE-NEXT:    vand 5, 6, 5
+; BE-NEXT:    vsrb 7, 6, 4
+; BE-NEXT:    vslb 4, 5, 4
+; BE-NEXT:    vor 4, 7, 4
+; BE-NEXT:    vand 5, 4, 3
+; BE-NEXT:    vsrb 4, 4, 2
+; BE-NEXT:    vslb 2, 5, 2
+; BE-NEXT:    vand 3, 4, 3
+; BE-NEXT:    vor 2, 3, 2
+; BE-NEXT:    vsrb 3, 2, 0
+; BE-NEXT:    vand 2, 2, 1
+; BE-NEXT:    vaddubm 2, 2, 2
+; BE-NEXT:    vand 3, 3, 1
+; BE-NEXT:    vor 2, 3, 2
+; BE-NEXT:    vsrb 2, 2, 0
+; BE-NEXT:    blr
+;
+; LE-LABEL: clmulh_v16i8:
+; LE:       # %bb.0:
+; LE-NEXT:    addis 3, 2, .LCPI8_0@toc@ha
+; LE-NEXT:    vspltisb 4, 4
+; LE-NEXT:    vspltisb 5, 2
+; LE-NEXT:    addi 3, 3, .LCPI8_0@toc@l
+; LE-NEXT:    vslb 1, 3, 4
+; LE-NEXT:    vsrb 3, 3, 4
+; LE-NEXT:    vslb 6, 2, 4
+; LE-NEXT:    vsrb 2, 2, 4
+; LE-NEXT:    lxvd2x 0, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI8_1@toc@ha
+; LE-NEXT:    xxlor 35, 35, 33
+; LE-NEXT:    xxlor 34, 34, 38
+; LE-NEXT:    vspltisb 0, 1
+; LE-NEXT:    addi 3, 3, .LCPI8_1@toc@l
+; LE-NEXT:    vsrb 1, 3, 5
+; LE-NEXT:    vsrb 7, 2, 5
+; LE-NEXT:    vspltisb 6, 8
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI8_2@toc@ha
+; LE-NEXT:    xxland 35, 35, 0
+; LE-NEXT:    xxland 34, 34, 0
+; LE-NEXT:    xxland 2, 33, 0
+; LE-NEXT:    xxland 3, 39, 0
+; LE-NEXT:    addi 3, 3, .LCPI8_2@toc@l
+; LE-NEXT:    vslb 3, 3, 5
+; LE-NEXT:    vslb 2, 2, 5
+; LE-NEXT:    xxlor 35, 2, 35
+; LE-NEXT:    xxlor 34, 3, 34
+; LE-NEXT:    lxvd2x 3, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI8_3@toc@ha
+; LE-NEXT:    vsrb 1, 3, 0
+; LE-NEXT:    xxland 35, 35, 1
+; LE-NEXT:    vsrb 7, 2, 0
+; LE-NEXT:    xxland 34, 34, 1
+; LE-NEXT:    addi 3, 3, .LCPI8_3@toc@l
+; LE-NEXT:    xxland 2, 33, 1
+; LE-NEXT:    vaddubm 3, 3, 3
+; LE-NEXT:    vaddubm 2, 2, 2
+; LE-NEXT:    xxlor 2, 2, 35
+; LE-NEXT:    xxland 35, 2, 37
+; LE-NEXT:    xxswapd 33, 3
+; LE-NEXT:    xxland 3, 39, 1
+; LE-NEXT:    xxlor 34, 3, 34
+; LE-NEXT:    lxvd2x 3, 0, 3
+; LE-NEXT:    vmuloub 7, 2, 3
+; LE-NEXT:    vmuleub 3, 2, 3
+; LE-NEXT:    vperm 3, 3, 7, 1
+; LE-NEXT:    xxland 39, 2, 32
+; LE-NEXT:    vmuloub 8, 2, 7
+; LE-NEXT:    vmuleub 7, 2, 7
+; LE-NEXT:    vperm 7, 7, 8, 1
+; LE-NEXT:    xxland 40, 2, 36
+; LE-NEXT:    vmuloub 9, 2, 8
+; LE-NEXT:    vmuleub 8, 2, 8
+; LE-NEXT:    vperm 8, 8, 9, 1
+; LE-NEXT:    xxland 41, 2, 38
+; LE-NEXT:    vaddubm 6, 6, 6
+; LE-NEXT:    vmuloub 10, 2, 9
+; LE-NEXT:    vmuleub 9, 2, 9
+; LE-NEXT:    xxland 38, 2, 38
+; LE-NEXT:    vperm 9, 9, 10, 1
+; LE-NEXT:    vmuloub 10, 2, 6
+; LE-NEXT:    vmuleub 6, 2, 6
+; LE-NEXT:    vperm 6, 6, 10, 1
+; LE-NEXT:    xxland 42, 2, 3
+; LE-NEXT:    vmuloub 11, 2, 10
+; LE-NEXT:    vmuleub 10, 2, 10
+; LE-NEXT:    vperm 10, 10, 11, 1
+; LE-NEXT:    vslb 11, 4, 4
+; LE-NEXT:    xxland 43, 2, 43
+; LE-NEXT:    vmuloub 12, 2, 11
+; LE-NEXT:    vmuleub 11, 2, 11
+; LE-NEXT:    vperm 11, 11, 12, 1
+; LE-NEXT:    xxleqv 44, 44, 44
+; LE-NEXT:    vslb 12, 12, 12
+; LE-NEXT:    xxland 44, 2, 44
+; LE-NEXT:    xxlxor 2, 39, 35
+; LE-NEXT:    xxlxor 2, 2, 40
+; LE-NEXT:    vmuloub 13, 2, 12
+; LE-NEXT:    vmuleub 2, 2, 12
+; LE-NEXT:    xxlxor 2, 2, 41
+; LE-NEXT:    xxlxor 2, 2, 38
+; LE-NEXT:    xxlxor 2, 2, 42
+; LE-NEXT:    xxlxor 2, 2, 43
+; LE-NEXT:    vperm 2, 2, 13, 1
+; LE-NEXT:    xxlxor 34, 2, 34
+; LE-NEXT:    vslb 3, 2, 4
+; LE-NEXT:    vsrb 2, 2, 4
+; LE-NEXT:    xxlor 34, 34, 35
+; LE-NEXT:    xxland 35, 34, 0
+; LE-NEXT:    vsrb 2, 2, 5
+; LE-NEXT:    vslb 3, 3, 5
+; LE-NEXT:    xxland 0, 34, 0
+; LE-NEXT:    xxlor 34, 0, 35
+; LE-NEXT:    vsrb 3, 2, 0
+; LE-NEXT:    xxland 34, 34, 1
+; LE-NEXT:    xxland 0, 35, 1
+; LE-NEXT:    vaddubm 2, 2, 2
+; LE-NEXT:    xxlor 34, 0, 34
+; LE-NEXT:    vsrb 2, 2, 0
+; LE-NEXT:    blr
+  %a.ext = zext <16 x i8> %a to <16 x i16>
+  %b.ext = zext <16 x i8> %b to <16 x i16>
+  %clmul = call <16 x i16> @llvm.clmul.v16i16(<16 x i16> %a.ext, <16 x i16> %b.ext)
+  %res.ext = lshr <16 x i16> %clmul, splat (i16 8)
+  %res = trunc <16 x i16> %res.ext to <16 x i8>
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @clmulh_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
+; BE-LABEL: clmulh_v8i16:
+; BE:       # %bb.0:
+; BE-NEXT:    li 3, -96
+; BE-NEXT:    vspltisb 1, -1
+; BE-NEXT:    vxor 5, 5, 5
+; BE-NEXT:    stvx 26, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -80
+; BE-NEXT:    vspltish 0, 4
+; BE-NEXT:    stvx 27, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -64
+; BE-NEXT:    vspltish 4, 1
+; BE-NEXT:    stvx 28, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -48
+; BE-NEXT:    vslh 17, 1, 1
+; BE-NEXT:    vspltish 15, 2
+; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -32
+; BE-NEXT:    vspltish 16, 8
+; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -16
+; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI9_0@toc@ha
+; BE-NEXT:    vslh 6, 0, 0
+; BE-NEXT:    addi 3, 3, .LCPI9_0@toc@l
+; BE-NEXT:    lvx 1, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI9_1@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI9_1@toc@l
+; BE-NEXT:    vperm 7, 2, 2, 1
+; BE-NEXT:    vspltisb 2, 4
+; BE-NEXT:    vperm 8, 3, 3, 1
+; BE-NEXT:    vspltisb 3, 15
+; BE-NEXT:    vsrb 11, 7, 2
+; BE-NEXT:    vand 7, 7, 3
+; BE-NEXT:    vslb 7, 7, 2
+; BE-NEXT:    vsrb 13, 8, 2
+; BE-NEXT:    vand 8, 8, 3
+; BE-NEXT:    vor 11, 11, 7
+; BE-NEXT:    lvx 7, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI9_2@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI9_2@toc@l
+; BE-NEXT:    vslb 8, 8, 2
+; BE-NEXT:    vor 13, 13, 8
+; BE-NEXT:    vspltisb 8, 2
+; BE-NEXT:    vand 19, 13, 7
+; BE-NEXT:    vsrb 13, 13, 8
+; BE-NEXT:    vslb 19, 19, 8
+; BE-NEXT:    vand 13, 13, 7
+; BE-NEXT:    vand 18, 11, 7
+; BE-NEXT:    vsrb 11, 11, 8
+; BE-NEXT:    vor 19, 13, 19
+; BE-NEXT:    lvx 13, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI9_3@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI9_3@toc@l
+; BE-NEXT:    vslb 18, 18, 8
+; BE-NEXT:    vand 11, 11, 7
+; BE-NEXT:    vor 18, 11, 18
+; BE-NEXT:    vspltisb 11, 1
+; BE-NEXT:    vsrb 30, 19, 11
+; BE-NEXT:    vand 19, 19, 13
+; BE-NEXT:    vaddubm 19, 19, 19
+; BE-NEXT:    vand 30, 30, 13
+; BE-NEXT:    vor 19, 30, 19
+; BE-NEXT:    lvx 30, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI9_4@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI9_4@toc@l
+; BE-NEXT:    lvx 29, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI9_5@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI9_5@toc@l
+; BE-NEXT:    lvx 28, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI9_6@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI9_6@toc@l
+; BE-NEXT:    vsrb 31, 18, 11
+; BE-NEXT:    lvx 27, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI9_7@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI9_7@toc@l
+; BE-NEXT:    lvx 26, 0, 3
+; BE-NEXT:    li 3, -16
+; BE-NEXT:    vand 18, 18, 13
+; BE-NEXT:    vaddubm 18, 18, 18
+; BE-NEXT:    vand 31, 31, 13
+; BE-NEXT:    vsldoi 9, 4, 4, 1
+; BE-NEXT:    vsldoi 10, 15, 15, 1
+; BE-NEXT:    vsldoi 12, 0, 0, 1
+; BE-NEXT:    vslh 14, 16, 16
+; BE-NEXT:    vor 18, 31, 18
+; BE-NEXT:    vand 31, 19, 16
+; BE-NEXT:    vadduhm 16, 16, 16
+; BE-NEXT:    vand 15, 19, 15
+; BE-NEXT:    vand 0, 19, 0
+; BE-NEXT:    vand 16, 19, 16
+; BE-NEXT:    vand 30, 19, 30
+; BE-NEXT:    vand 6, 19, 6
+; BE-NEXT:    vand 29, 19, 29
+; BE-NEXT:    vand 9, 19, 9
+; BE-NEXT:    vand 10, 19, 10
+; BE-NEXT:    vand 12, 19, 12
+; BE-NEXT:    vand 14, 19, 14
+; BE-NEXT:    vand 28, 19, 28
+; BE-NEXT:    vand 27, 19, 27
+; BE-NEXT:    vand 26, 19, 26
+; BE-NEXT:    vand 17, 19, 17
+; BE-NEXT:    vand 19, 19, 4
+; BE-NEXT:    vmladduhm 15, 18, 15, 5
+; BE-NEXT:    vmladduhm 19, 18, 19, 5
+; BE-NEXT:    vmladduhm 0, 18, 0, 5
+; BE-NEXT:    vxor 15, 19, 15
+; BE-NEXT:    vmladduhm 31, 18, 31, 5
+; BE-NEXT:    vxor 0, 15, 0
+; BE-NEXT:    vmladduhm 16, 18, 16, 5
+; BE-NEXT:    vxor 0, 0, 31
+; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, -32
+; BE-NEXT:    vmladduhm 30, 18, 30, 5
+; BE-NEXT:    vxor 0, 0, 16
+; BE-NEXT:    vmladduhm 6, 18, 6, 5
+; BE-NEXT:    vxor 0, 0, 30
+; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, -48
+; BE-NEXT:    vmladduhm 29, 18, 29, 5
+; BE-NEXT:    vxor 0, 0, 6
+; BE-NEXT:    vmladduhm 9, 18, 9, 5
+; BE-NEXT:    vxor 0, 0, 29
+; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, -64
+; BE-NEXT:    vmladduhm 10, 18, 10, 5
+; BE-NEXT:    vxor 0, 0, 9
+; BE-NEXT:    vmladduhm 12, 18, 12, 5
+; BE-NEXT:    vxor 0, 0, 10
+; BE-NEXT:    vmladduhm 14, 18, 14, 5
+; BE-NEXT:    vxor 0, 0, 12
+; BE-NEXT:    vmladduhm 28, 18, 28, 5
+; BE-NEXT:    vxor 0, 0, 14
+; BE-NEXT:    vmladduhm 27, 18, 27, 5
+; BE-NEXT:    vxor 0, 0, 28
 ; BE-NEXT:    lvx 28, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 400
-; BE-NEXT:    vslw 7, 7, 15
+; BE-NEXT:    li 3, -80
+; BE-NEXT:    vmladduhm 26, 18, 26, 5
+; BE-NEXT:    vxor 0, 0, 27
 ; BE-NEXT:    lvx 27, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 368
+; BE-NEXT:    li 3, -96
+; BE-NEXT:    vmladduhm 5, 18, 17, 5
+; BE-NEXT:    vxor 0, 0, 26
 ; BE-NEXT:    lvx 26, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 336
-; BE-NEXT:    lvx 25, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 304
-; BE-NEXT:    vslw 8, 8, 15
-; BE-NEXT:    lvx 24, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 240
-; BE-NEXT:    lvx 23, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 208
-; BE-NEXT:    vslw 10, 10, 15
-; BE-NEXT:    lvx 22, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 160
-; BE-NEXT:    lvx 21, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 128
-; BE-NEXT:    lvx 20, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1120
-; BE-NEXT:    vslw 11, 11, 15
-; BE-NEXT:    vslw 20, 20, 15
+; BE-NEXT:    vxor 5, 0, 5
+; BE-NEXT:    vperm 5, 5, 5, 1
+; BE-NEXT:    vand 3, 5, 3
+; BE-NEXT:    vsrb 0, 5, 2
+; BE-NEXT:    vslb 2, 3, 2
+; BE-NEXT:    vor 2, 0, 2
+; BE-NEXT:    vand 3, 2, 7
+; BE-NEXT:    vsrb 2, 2, 8
+; BE-NEXT:    vslb 3, 3, 8
+; BE-NEXT:    vand 2, 2, 7
+; BE-NEXT:    vor 2, 2, 3
+; BE-NEXT:    vsrb 3, 2, 11
+; BE-NEXT:    vand 2, 2, 13
+; BE-NEXT:    vaddubm 2, 2, 2
+; BE-NEXT:    vand 3, 3, 13
+; BE-NEXT:    vor 2, 3, 2
+; BE-NEXT:    vsrh 2, 2, 4
+; BE-NEXT:    blr
+;
+; LE-LABEL: clmulh_v8i16:
+; LE:       # %bb.0:
+; LE-NEXT:    addis 3, 2, .LCPI9_0@toc@ha
+; LE-NEXT:    vspltisb 5, 4
+; LE-NEXT:    vspltish 9, 2
+; LE-NEXT:    addi 3, 3, .LCPI9_0@toc@l
+; LE-NEXT:    vspltish 0, 1
+; LE-NEXT:    vspltish 6, 4
+; LE-NEXT:    vspltish 1, 8
+; LE-NEXT:    lxvd2x 0, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI9_1@toc@ha
+; LE-NEXT:    vsldoi 11, 9, 9, 1
+; LE-NEXT:    addi 3, 3, .LCPI9_1@toc@l
+; LE-NEXT:    vsldoi 13, 6, 6, 1
+; LE-NEXT:    vsldoi 10, 0, 0, 1
+; LE-NEXT:    xxswapd 36, 0
+; LE-NEXT:    lxvd2x 0, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI9_2@toc@ha
+; LE-NEXT:    addi 3, 3, .LCPI9_2@toc@l
+; LE-NEXT:    vperm 8, 2, 2, 4
+; LE-NEXT:    vperm 7, 3, 3, 4
+; LE-NEXT:    vspltisb 3, 2
+; LE-NEXT:    vspltisb 2, 1
+; LE-NEXT:    vslb 12, 8, 5
+; LE-NEXT:    vsrb 8, 8, 5
+; LE-NEXT:    xxlor 40, 40, 44
+; LE-NEXT:    xxland 44, 40, 0
+; LE-NEXT:    vsrb 8, 8, 3
+; LE-NEXT:    vslb 12, 12, 3
+; LE-NEXT:    xxland 1, 40, 0
+; LE-NEXT:    xxlor 40, 1, 44
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI9_3@toc@ha
+; LE-NEXT:    vsrb 12, 8, 2
+; LE-NEXT:    addi 3, 3, .LCPI9_3@toc@l
+; LE-NEXT:    lxvd2x 4, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI9_4@toc@ha
+; LE-NEXT:    xxland 2, 44, 1
+; LE-NEXT:    vslb 12, 7, 5
+; LE-NEXT:    vsrb 7, 7, 5
+; LE-NEXT:    xxland 40, 40, 1
+; LE-NEXT:    addi 3, 3, .LCPI9_4@toc@l
+; LE-NEXT:    xxlor 39, 39, 44
+; LE-NEXT:    vaddubm 8, 8, 8
+; LE-NEXT:    xxland 44, 39, 0
+; LE-NEXT:    vsrb 7, 7, 3
+; LE-NEXT:    xxlor 40, 2, 40
+; LE-NEXT:    vslb 12, 12, 3
+; LE-NEXT:    xxland 2, 39, 0
+; LE-NEXT:    xxlor 39, 2, 44
+; LE-NEXT:    vsrb 12, 7, 2
+; LE-NEXT:    xxland 39, 39, 1
+; LE-NEXT:    xxland 2, 44, 1
+; LE-NEXT:    vaddubm 7, 7, 7
+; LE-NEXT:    xxlor 2, 2, 39
+; LE-NEXT:    vxor 7, 7, 7
+; LE-NEXT:    xxland 41, 2, 41
+; LE-NEXT:    xxland 44, 2, 32
+; LE-NEXT:    vmladduhm 9, 8, 9, 7
+; LE-NEXT:    vmladduhm 12, 8, 12, 7
+; LE-NEXT:    xxlxor 3, 44, 41
+; LE-NEXT:    xxland 41, 2, 38
+; LE-NEXT:    vslh 6, 6, 6
+; LE-NEXT:    vmladduhm 9, 8, 9, 7
+; LE-NEXT:    xxland 38, 2, 38
+; LE-NEXT:    vmladduhm 6, 8, 6, 7
+; LE-NEXT:    xxlxor 3, 3, 41
+; LE-NEXT:    xxland 41, 2, 33
+; LE-NEXT:    vmladduhm 9, 8, 9, 7
+; LE-NEXT:    xxlxor 3, 3, 41
+; LE-NEXT:    vadduhm 9, 1, 1
+; LE-NEXT:    vslh 1, 1, 1
+; LE-NEXT:    xxland 41, 2, 41
+; LE-NEXT:    xxland 33, 2, 33
+; LE-NEXT:    vmladduhm 9, 8, 9, 7
+; LE-NEXT:    vmladduhm 1, 8, 1, 7
+; LE-NEXT:    xxlxor 3, 3, 41
+; LE-NEXT:    xxland 41, 2, 4
+; LE-NEXT:    lxvd2x 4, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI9_5@toc@ha
+; LE-NEXT:    vmladduhm 9, 8, 9, 7
+; LE-NEXT:    addi 3, 3, .LCPI9_5@toc@l
+; LE-NEXT:    xxlxor 3, 3, 41
+; LE-NEXT:    xxlxor 3, 3, 38
+; LE-NEXT:    xxland 38, 2, 4
+; LE-NEXT:    lxvd2x 4, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI9_6@toc@ha
+; LE-NEXT:    vmladduhm 6, 8, 6, 7
+; LE-NEXT:    addi 3, 3, .LCPI9_6@toc@l
+; LE-NEXT:    xxlxor 3, 3, 38
+; LE-NEXT:    xxland 38, 2, 42
+; LE-NEXT:    vmladduhm 6, 8, 6, 7
+; LE-NEXT:    xxlxor 3, 3, 38
+; LE-NEXT:    xxland 38, 2, 43
+; LE-NEXT:    vmladduhm 6, 8, 6, 7
+; LE-NEXT:    xxlxor 3, 3, 38
+; LE-NEXT:    xxland 38, 2, 45
+; LE-NEXT:    vmladduhm 6, 8, 6, 7
+; LE-NEXT:    xxlxor 3, 3, 38
+; LE-NEXT:    xxlxor 3, 3, 33
+; LE-NEXT:    xxland 33, 2, 4
+; LE-NEXT:    lxvd2x 4, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI9_7@toc@ha
+; LE-NEXT:    vmladduhm 1, 8, 1, 7
+; LE-NEXT:    addi 3, 3, .LCPI9_7@toc@l
+; LE-NEXT:    xxlxor 3, 3, 33
+; LE-NEXT:    xxland 33, 2, 4
+; LE-NEXT:    lxvd2x 4, 0, 3
+; LE-NEXT:    vmladduhm 1, 8, 1, 7
+; LE-NEXT:    xxlxor 3, 3, 33
+; LE-NEXT:    xxland 33, 2, 4
+; LE-NEXT:    vmladduhm 1, 8, 1, 7
+; LE-NEXT:    xxlxor 3, 3, 33
+; LE-NEXT:    xxleqv 33, 33, 33
+; LE-NEXT:    vslh 1, 1, 1
+; LE-NEXT:    xxland 33, 2, 33
+; LE-NEXT:    vmladduhm 1, 8, 1, 7
+; LE-NEXT:    xxlxor 33, 3, 33
+; LE-NEXT:    vperm 4, 1, 1, 4
+; LE-NEXT:    vslb 1, 4, 5
+; LE-NEXT:    vsrb 4, 4, 5
+; LE-NEXT:    xxlor 36, 36, 33
+; LE-NEXT:    xxland 37, 36, 0
+; LE-NEXT:    vslb 5, 5, 3
+; LE-NEXT:    vsrb 3, 4, 3
+; LE-NEXT:    xxland 0, 35, 0
+; LE-NEXT:    xxlor 35, 0, 37
+; LE-NEXT:    vsrb 2, 3, 2
+; LE-NEXT:    xxland 0, 34, 1
+; LE-NEXT:    xxland 34, 35, 1
+; LE-NEXT:    vaddubm 2, 2, 2
+; LE-NEXT:    xxlor 34, 0, 34
+; LE-NEXT:    vsrh 2, 2, 0
+; LE-NEXT:    blr
+  %a.ext = zext <8 x i16> %a to <8 x i32>
+  %b.ext = zext <8 x i16> %b to <8 x i32>
+  %clmul = call <8 x i32> @llvm.clmul.v8i32(<8 x i32> %a.ext, <8 x i32> %b.ext)
+  %res.ext = lshr <8 x i32> %clmul, splat (i32 16)
+  %res = trunc <8 x i32> %res.ext to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
+; BE-LABEL: clmulh_v4i32:
+; BE:       # %bb.0:
+; BE-NEXT:    stdu 1, -1456(1)
+; BE-NEXT:    li 3, 1264
+; BE-NEXT:    addi 4, 1, 1168
+; BE-NEXT:    vspltisw 4, 1
 ; BE-NEXT:    stvx 20, 1, 3 # 16-byte Folded Spill
-; BE-NEXT:    li 3, 64
-; BE-NEXT:    lvx 20, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 96
-; BE-NEXT:    vslw 12, 12, 15
-; BE-NEXT:    vslw 13, 13, 15
-; BE-NEXT:    vslw 14, 14, 15
-; BE-NEXT:    vslw 16, 16, 15
-; BE-NEXT:    vslw 17, 17, 15
-; BE-NEXT:    vslw 18, 18, 15
-; BE-NEXT:    vslw 19, 19, 15
-; BE-NEXT:    vslw 31, 31, 15
-; BE-NEXT:    vslw 30, 30, 15
-; BE-NEXT:    vslw 29, 29, 15
-; BE-NEXT:    vslw 28, 28, 15
-; BE-NEXT:    vslw 27, 27, 15
-; BE-NEXT:    vslw 26, 26, 15
-; BE-NEXT:    vslw 25, 25, 15
-; BE-NEXT:    vslw 24, 24, 15
-; BE-NEXT:    vslw 23, 23, 15
-; BE-NEXT:    vslw 22, 22, 15
-; BE-NEXT:    vslw 21, 21, 15
-; BE-NEXT:    vslw 20, 20, 15
-; BE-NEXT:    lvx 15, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 80
-; BE-NEXT:    vadduwm 9, 15, 9
-; BE-NEXT:    lvx 15, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 112
-; BE-NEXT:    vadduwm 2, 15, 2
-; BE-NEXT:    vxor 2, 2, 9
-; BE-NEXT:    lvx 9, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 144
+; BE-NEXT:    vsldoi 13, 4, 4, 3
+; BE-NEXT:    li 3, 1280
+; BE-NEXT:    lis 5, -21846
+; BE-NEXT:    vspltisw 1, 2
+; BE-NEXT:    stvx 21, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1296
+; BE-NEXT:    lis 9, -13108
+; BE-NEXT:    ori 7, 5, 43690
+; BE-NEXT:    ori 5, 9, 52428
+; BE-NEXT:    vspltisw 8, 8
+; BE-NEXT:    stvx 22, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1312
+; BE-NEXT:    lis 6, 21845
+; BE-NEXT:    vslw 7, 8, 8
+; BE-NEXT:    stvx 23, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1328
+; BE-NEXT:    lis 10, 13107
+; BE-NEXT:    ori 8, 6, 21845
+; BE-NEXT:    ori 6, 10, 13107
+; BE-NEXT:    stvx 24, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1344
+; BE-NEXT:    lis 11, 3855
+; BE-NEXT:    stvx 25, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1360
+; BE-NEXT:    vsldoi 12, 8, 8, 2
+; BE-NEXT:    stvx 26, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1376
+; BE-NEXT:    stvx 27, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1392
+; BE-NEXT:    stvx 28, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1408
+; BE-NEXT:    vsldoi 20, 8, 8, 3
+; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1424
+; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1440
+; BE-NEXT:    vsldoi 10, 1, 1, 2
+; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addi 3, 1, 1184
+; BE-NEXT:    stvx 3, 0, 3
+; BE-NEXT:    lis 3, -3856
+; BE-NEXT:    vspltisw 3, 4
+; BE-NEXT:    ori 3, 3, 61680
+; BE-NEXT:    stvx 2, 0, 4
+; BE-NEXT:    ori 4, 11, 3855
+; BE-NEXT:    li 11, 1152
+; BE-NEXT:    vspltisb 2, -1
+; BE-NEXT:    vslw 14, 3, 3
+; BE-NEXT:    lwz 9, 1196(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    vsldoi 16, 3, 3, 1
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    vsldoi 11, 3, 3, 2
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    vsldoi 21, 3, 3, 3
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1228(1)
+; BE-NEXT:    lwz 9, 1192(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    vsldoi 22, 1, 1, 3
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    vslw 2, 2, 2
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    vsldoi 9, 4, 4, 2
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1224(1)
+; BE-NEXT:    lwz 9, 1188(1)
+; BE-NEXT:    vsldoi 5, 4, 4, 1
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    vsldoi 17, 1, 1, 1
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1220(1)
+; BE-NEXT:    lwz 9, 1184(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1216(1)
+; BE-NEXT:    lwz 9, 1180(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1212(1)
+; BE-NEXT:    lwz 9, 1176(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1208(1)
+; BE-NEXT:    lwz 9, 1172(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    stvx 4, 1, 11 # 16-byte Folded Spill
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1204(1)
+; BE-NEXT:    lwz 9, 1168(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1200(1)
+; BE-NEXT:    addi 9, 1, 1216
+; BE-NEXT:    lvx 15, 0, 9
+; BE-NEXT:    li 9, 864
+; BE-NEXT:    vand 6, 15, 1
+; BE-NEXT:    stvx 6, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 832
+; BE-NEXT:    vand 0, 15, 3
+; BE-NEXT:    vadduwm 3, 8, 8
+; BE-NEXT:    stvx 0, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    addis 9, 2, .LCPI10_0@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI10_0@toc@l
+; BE-NEXT:    vand 30, 15, 3
+; BE-NEXT:    lvx 3, 0, 9
+; BE-NEXT:    addis 9, 2, .LCPI10_1@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI10_1@toc@l
+; BE-NEXT:    vand 28, 15, 3
+; BE-NEXT:    lvx 3, 0, 9
+; BE-NEXT:    addis 9, 2, .LCPI10_2@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI10_2@toc@l
+; BE-NEXT:    vand 26, 15, 3
+; BE-NEXT:    lvx 3, 0, 9
+; BE-NEXT:    li 9, 800
+; BE-NEXT:    vand 27, 15, 14
+; BE-NEXT:    vand 14, 15, 7
+; BE-NEXT:    vand 7, 15, 3
+; BE-NEXT:    stvx 7, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    addis 9, 2, .LCPI10_3@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI10_3@toc@l
+; BE-NEXT:    lvx 3, 0, 9
+; BE-NEXT:    li 9, 768
+; BE-NEXT:    vand 19, 15, 8
+; BE-NEXT:    vand 8, 15, 3
+; BE-NEXT:    stvx 8, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    addis 9, 2, .LCPI10_4@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI10_4@toc@l
+; BE-NEXT:    lvx 3, 0, 9
+; BE-NEXT:    addis 9, 2, .LCPI10_5@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI10_5@toc@l
+; BE-NEXT:    vand 23, 15, 3
+; BE-NEXT:    lvx 3, 0, 9
+; BE-NEXT:    addis 9, 2, .LCPI10_6@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI10_6@toc@l
+; BE-NEXT:    vand 31, 15, 3
+; BE-NEXT:    lvx 3, 0, 9
+; BE-NEXT:    addis 9, 2, .LCPI10_7@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI10_7@toc@l
+; BE-NEXT:    vand 18, 15, 10
+; BE-NEXT:    vand 10, 15, 12
+; BE-NEXT:    vand 12, 15, 3
+; BE-NEXT:    lvx 3, 0, 9
+; BE-NEXT:    li 9, 1088
+; BE-NEXT:    vand 3, 15, 3
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    addis 9, 2, .LCPI10_8@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI10_8@toc@l
+; BE-NEXT:    lvx 3, 0, 9
+; BE-NEXT:    li 9, 992
+; BE-NEXT:    vand 3, 15, 3
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    addis 9, 2, .LCPI10_9@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI10_9@toc@l
+; BE-NEXT:    lvx 3, 0, 9
+; BE-NEXT:    li 9, 896
+; BE-NEXT:    vand 3, 15, 3
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 928
+; BE-NEXT:    vand 3, 15, 13
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 960
+; BE-NEXT:    vand 3, 15, 22
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1024
+; BE-NEXT:    vand 3, 15, 21
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1056
+; BE-NEXT:    vand 3, 15, 20
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    addis 9, 2, .LCPI10_10@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI10_10@toc@l
+; BE-NEXT:    lvx 3, 0, 9
+; BE-NEXT:    li 9, 336
+; BE-NEXT:    vand 3, 15, 3
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    addis 9, 2, .LCPI10_11@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI10_11@toc@l
+; BE-NEXT:    lvx 3, 0, 9
+; BE-NEXT:    li 9, 256
+; BE-NEXT:    vand 3, 15, 3
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    addis 9, 2, .LCPI10_12@toc@ha
+; BE-NEXT:    addi 9, 9, .LCPI10_12@toc@l
+; BE-NEXT:    lvx 3, 0, 9
+; BE-NEXT:    li 9, 160
+; BE-NEXT:    vand 3, 15, 3
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 240
+; BE-NEXT:    vand 2, 15, 2
+; BE-NEXT:    stvx 2, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    addi 9, 1, 1200
+; BE-NEXT:    lvx 2, 0, 9
+; BE-NEXT:    li 9, 1136
+; BE-NEXT:    vand 29, 15, 16
+; BE-NEXT:    vand 16, 15, 9
+; BE-NEXT:    vspltisw 9, -16
+; BE-NEXT:    vand 25, 15, 5
+; BE-NEXT:    vand 20, 15, 4
+; BE-NEXT:    vxor 4, 4, 4
+; BE-NEXT:    vrlw 5, 6, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1120
+; BE-NEXT:    vrlw 5, 20, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1104
+; BE-NEXT:    vrlw 5, 0, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1072
+; BE-NEXT:    vrlw 5, 19, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1040
+; BE-NEXT:    vrlw 5, 30, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1008
+; BE-NEXT:    vrlw 5, 28, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 976
+; BE-NEXT:    vrlw 5, 27, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 944
+; BE-NEXT:    vrlw 5, 26, 9
+; BE-NEXT:    vand 24, 15, 17
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 912
+; BE-NEXT:    vrlw 5, 25, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 880
+; BE-NEXT:    vrlw 5, 24, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 848
+; BE-NEXT:    vrlw 5, 29, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 816
+; BE-NEXT:    vrlw 5, 14, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 784
+; BE-NEXT:    vrlw 5, 7, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 752
+; BE-NEXT:    vrlw 5, 8, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 720
+; BE-NEXT:    vrlw 5, 23, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 688
+; BE-NEXT:    vrlw 5, 31, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 656
+; BE-NEXT:    vrlw 5, 16, 9
+; BE-NEXT:    vand 11, 15, 11
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 608
+; BE-NEXT:    vrlw 5, 18, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 576
+; BE-NEXT:    vrlw 5, 11, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 544
+; BE-NEXT:    vrlw 5, 10, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 512
+; BE-NEXT:    vrlw 5, 12, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1088
+; BE-NEXT:    vmr 22, 30
+; BE-NEXT:    vmr 30, 14
+; BE-NEXT:    lvx 14, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 480
+; BE-NEXT:    vrlw 5, 14, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 992
+; BE-NEXT:    lvx 13, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 448
+; BE-NEXT:    vrlw 5, 13, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 896
+; BE-NEXT:    vmr 15, 12
+; BE-NEXT:    lvx 12, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 416
+; BE-NEXT:    vrlw 5, 12, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 928
+; BE-NEXT:    vmr 17, 11
+; BE-NEXT:    lvx 11, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 384
+; BE-NEXT:    vrlw 5, 11, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 960
+; BE-NEXT:    vmr 21, 19
+; BE-NEXT:    vmr 19, 16
+; BE-NEXT:    vmr 16, 10
+; BE-NEXT:    lvx 10, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 352
+; BE-NEXT:    vrlw 5, 10, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1024
+; BE-NEXT:    lvx 8, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 304
+; BE-NEXT:    vrlw 5, 8, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1056
+; BE-NEXT:    lvx 7, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 272
+; BE-NEXT:    vrlw 5, 7, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 336
+; BE-NEXT:    lvx 6, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 208
+; BE-NEXT:    vrlw 5, 6, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 256
+; BE-NEXT:    lvx 1, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 176
+; BE-NEXT:    vrlw 5, 1, 9
+; BE-NEXT:    vmsumuhm 3, 2, 5, 4
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 160
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 128
+; BE-NEXT:    vrlw 5, 3, 9
+; BE-NEXT:    vmsumuhm 5, 2, 5, 4
+; BE-NEXT:    stvx 5, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 240
+; BE-NEXT:    lvx 5, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 64
+; BE-NEXT:    vrlw 0, 5, 9
+; BE-NEXT:    vmsumuhm 4, 2, 0, 4
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 864
+; BE-NEXT:    lvx 4, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 96
+; BE-NEXT:    vmulouh 4, 2, 4
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 80
+; BE-NEXT:    vmulouh 4, 2, 20
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 832
+; BE-NEXT:    lvx 4, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 112
+; BE-NEXT:    vmulouh 4, 2, 4
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 144
+; BE-NEXT:    vmulouh 4, 2, 21
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 192
+; BE-NEXT:    vmulouh 4, 2, 22
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 224
+; BE-NEXT:    vmulouh 4, 2, 28
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 288
+; BE-NEXT:    vmulouh 4, 2, 27
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 320
+; BE-NEXT:    vmulouh 4, 2, 26
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 368
+; BE-NEXT:    vmulouh 4, 2, 25
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 400
+; BE-NEXT:    vmulouh 4, 2, 24
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 432
+; BE-NEXT:    vmulouh 4, 2, 29
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 464
+; BE-NEXT:    vmulouh 4, 2, 30
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 800
+; BE-NEXT:    lvx 4, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 496
+; BE-NEXT:    vmulouh 4, 2, 4
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 768
+; BE-NEXT:    lvx 4, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 528
+; BE-NEXT:    vmulouh 4, 2, 4
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 560
+; BE-NEXT:    vmulouh 4, 2, 23
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 592
+; BE-NEXT:    vmulouh 4, 2, 31
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 624
+; BE-NEXT:    vmulouh 4, 2, 19
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 640
+; BE-NEXT:    vmulouh 4, 2, 18
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 672
+; BE-NEXT:    vmulouh 4, 2, 17
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 704
+; BE-NEXT:    vmulouh 4, 2, 16
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 736
+; BE-NEXT:    vmulouh 4, 2, 15
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 768
+; BE-NEXT:    vmulouh 4, 2, 14
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 800
+; BE-NEXT:    vmulouh 4, 2, 13
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 832
+; BE-NEXT:    vmulouh 4, 2, 12
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 864
+; BE-NEXT:    vmulouh 4, 2, 11
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 896
+; BE-NEXT:    vmulouh 4, 2, 10
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 928
+; BE-NEXT:    vmulouh 4, 2, 8
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 960
+; BE-NEXT:    vmulouh 4, 2, 7
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 992
+; BE-NEXT:    vmulouh 4, 2, 6
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1024
+; BE-NEXT:    vmulouh 4, 2, 1
+; BE-NEXT:    stvx 4, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1056
+; BE-NEXT:    vmulouh 3, 2, 3
+; BE-NEXT:    stvx 3, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1088
+; BE-NEXT:    vmulouh 2, 2, 5
+; BE-NEXT:    stvx 2, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 1136
+; BE-NEXT:    lvx 2, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1120
+; BE-NEXT:    vslw 3, 2, 9
+; BE-NEXT:    lvx 2, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1104
+; BE-NEXT:    lvx 4, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1072
+; BE-NEXT:    lvx 5, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1040
+; BE-NEXT:    vslw 2, 2, 9
+; BE-NEXT:    lvx 0, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1008
+; BE-NEXT:    lvx 1, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 976
+; BE-NEXT:    lvx 6, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 944
+; BE-NEXT:    vslw 4, 4, 9
+; BE-NEXT:    lvx 7, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 912
+; BE-NEXT:    lvx 8, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 880
+; BE-NEXT:    lvx 10, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 848
+; BE-NEXT:    vslw 5, 5, 9
+; BE-NEXT:    lvx 11, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 816
+; BE-NEXT:    lvx 12, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 784
+; BE-NEXT:    vslw 0, 0, 9
+; BE-NEXT:    lvx 13, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 752
+; BE-NEXT:    lvx 14, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 720
+; BE-NEXT:    lvx 15, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 688
+; BE-NEXT:    vslw 1, 1, 9
+; BE-NEXT:    lvx 16, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 656
+; BE-NEXT:    lvx 17, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 608
+; BE-NEXT:    lvx 18, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 576
+; BE-NEXT:    vslw 6, 6, 9
+; BE-NEXT:    lvx 19, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 544
+; BE-NEXT:    lvx 31, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 512
+; BE-NEXT:    vslw 7, 7, 9
+; BE-NEXT:    lvx 30, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 480
+; BE-NEXT:    lvx 29, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 448
+; BE-NEXT:    lvx 28, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 416
+; BE-NEXT:    vslw 8, 8, 9
+; BE-NEXT:    lvx 27, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 384
+; BE-NEXT:    lvx 26, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 352
+; BE-NEXT:    lvx 25, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 304
+; BE-NEXT:    vslw 10, 10, 9
+; BE-NEXT:    lvx 24, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 272
+; BE-NEXT:    lvx 23, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 208
+; BE-NEXT:    vslw 11, 11, 9
+; BE-NEXT:    lvx 22, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 176
+; BE-NEXT:    lvx 21, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 128
+; BE-NEXT:    lvx 20, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1136
+; BE-NEXT:    vslw 12, 12, 9
+; BE-NEXT:    vslw 20, 20, 9
+; BE-NEXT:    stvx 20, 1, 9 # 16-byte Folded Spill
+; BE-NEXT:    li 9, 64
+; BE-NEXT:    lvx 20, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 96
+; BE-NEXT:    vslw 13, 13, 9
+; BE-NEXT:    vslw 14, 14, 9
+; BE-NEXT:    vslw 15, 15, 9
+; BE-NEXT:    vslw 16, 16, 9
+; BE-NEXT:    vslw 17, 17, 9
+; BE-NEXT:    vslw 18, 18, 9
+; BE-NEXT:    vslw 19, 19, 9
+; BE-NEXT:    vslw 31, 31, 9
+; BE-NEXT:    vslw 30, 30, 9
+; BE-NEXT:    vslw 29, 29, 9
+; BE-NEXT:    vslw 28, 28, 9
+; BE-NEXT:    vslw 27, 27, 9
+; BE-NEXT:    vslw 26, 26, 9
+; BE-NEXT:    vslw 25, 25, 9
+; BE-NEXT:    vslw 24, 24, 9
+; BE-NEXT:    vslw 23, 23, 9
+; BE-NEXT:    vslw 22, 22, 9
+; BE-NEXT:    vslw 21, 21, 9
+; BE-NEXT:    vslw 20, 20, 9
+; BE-NEXT:    lvx 9, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 80
 ; BE-NEXT:    vadduwm 3, 9, 3
+; BE-NEXT:    lvx 9, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 112
+; BE-NEXT:    vadduwm 2, 9, 2
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 176
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 144
 ; BE-NEXT:    vadduwm 3, 3, 4
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 224
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 192
 ; BE-NEXT:    vadduwm 3, 3, 5
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 256
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 224
 ; BE-NEXT:    vadduwm 3, 3, 0
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 320
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 288
 ; BE-NEXT:    vadduwm 3, 3, 1
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 352
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 320
 ; BE-NEXT:    vadduwm 3, 3, 6
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 384
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 368
 ; BE-NEXT:    vadduwm 3, 3, 7
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 416
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 400
 ; BE-NEXT:    vadduwm 3, 3, 8
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 448
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 432
 ; BE-NEXT:    vadduwm 3, 3, 10
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 480
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 464
 ; BE-NEXT:    vadduwm 3, 3, 11
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 512
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 496
 ; BE-NEXT:    vadduwm 3, 3, 12
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 544
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 528
 ; BE-NEXT:    vadduwm 3, 3, 13
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 576
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 560
 ; BE-NEXT:    vadduwm 3, 3, 14
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 608
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 592
+; BE-NEXT:    vadduwm 3, 3, 15
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 624
 ; BE-NEXT:    vadduwm 3, 3, 16
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 624
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 640
 ; BE-NEXT:    vadduwm 3, 3, 17
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 656
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 672
 ; BE-NEXT:    vadduwm 3, 3, 18
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 688
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 704
 ; BE-NEXT:    vadduwm 3, 3, 19
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 720
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 736
 ; BE-NEXT:    vadduwm 3, 3, 31
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 752
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 768
 ; BE-NEXT:    vadduwm 3, 3, 30
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 784
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 800
 ; BE-NEXT:    vadduwm 3, 3, 29
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 816
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 832
 ; BE-NEXT:    vadduwm 3, 3, 28
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 848
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 864
 ; BE-NEXT:    vadduwm 3, 3, 27
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 880
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 896
 ; BE-NEXT:    vadduwm 3, 3, 26
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 912
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 928
 ; BE-NEXT:    vadduwm 3, 3, 25
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 944
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 960
 ; BE-NEXT:    vadduwm 3, 3, 24
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 976
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 992
 ; BE-NEXT:    vadduwm 3, 3, 23
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1008
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1024
 ; BE-NEXT:    vadduwm 3, 3, 22
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1040
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1056
 ; BE-NEXT:    vadduwm 3, 3, 21
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1120
-; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1072
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1136
+; BE-NEXT:    lvx 4, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    li 9, 1088
 ; BE-NEXT:    vadduwm 3, 3, 4
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1264
-; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1136
+; BE-NEXT:    lvx 3, 1, 9 # 16-byte Folded Reload
+; BE-NEXT:    addi 9, 1, 1232
 ; BE-NEXT:    vadduwm 3, 3, 20
-; BE-NEXT:    lvx 1, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1248
 ; BE-NEXT:    vxor 2, 2, 3
-; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1232
-; BE-NEXT:    vsrw 3, 2, 5
-; BE-NEXT:    vsrw 4, 2, 1
-; BE-NEXT:    vslw 5, 2, 5
-; BE-NEXT:    vand 2, 2, 0
-; BE-NEXT:    vslw 2, 2, 1
-; BE-NEXT:    vand 4, 4, 0
-; BE-NEXT:    vor 2, 5, 2
-; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    stvx 2, 0, 9
+; BE-NEXT:    lwz 9, 1244(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1260(1)
+; BE-NEXT:    lwz 9, 1240(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1256(1)
+; BE-NEXT:    lwz 9, 1236(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 10, 10, 7
+; BE-NEXT:    and 9, 9, 8
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 2
+; BE-NEXT:    srwi 9, 9, 2
+; BE-NEXT:    and 10, 10, 5
+; BE-NEXT:    and 9, 9, 6
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    slwi 10, 9, 4
+; BE-NEXT:    srwi 9, 9, 4
+; BE-NEXT:    and 10, 10, 3
+; BE-NEXT:    and 9, 9, 4
+; BE-NEXT:    or 9, 9, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    rldicl 9, 10, 0, 32
+; BE-NEXT:    stw 9, 1252(1)
+; BE-NEXT:    lwz 9, 1232(1)
+; BE-NEXT:    slwi 10, 9, 1
+; BE-NEXT:    srwi 9, 9, 1
+; BE-NEXT:    and 7, 10, 7
+; BE-NEXT:    and 8, 9, 8
+; BE-NEXT:    or 7, 8, 7
+; BE-NEXT:    slwi 8, 7, 2
+; BE-NEXT:    srwi 7, 7, 2
+; BE-NEXT:    and 5, 8, 5
+; BE-NEXT:    and 6, 7, 6
+; BE-NEXT:    or 5, 6, 5
+; BE-NEXT:    slwi 6, 5, 4
+; BE-NEXT:    srwi 5, 5, 4
+; BE-NEXT:    and 3, 6, 3
+; BE-NEXT:    and 4, 5, 4
+; BE-NEXT:    or 3, 4, 3
+; BE-NEXT:    rotlwi 4, 3, 24
+; BE-NEXT:    rlwimi 4, 3, 8, 8, 15
+; BE-NEXT:    rlwimi 4, 3, 8, 24, 31
+; BE-NEXT:    rldicl 3, 4, 0, 32
+; BE-NEXT:    stw 3, 1248(1)
+; BE-NEXT:    addi 3, 1, 1248
+; BE-NEXT:    lvx 2, 0, 3
 ; BE-NEXT:    li 3, 1152
-; BE-NEXT:    vor 3, 4, 3
-; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1216
-; BE-NEXT:    vor 2, 2, 3
-; BE-NEXT:    vand 3, 2, 5
-; BE-NEXT:    vsrw 2, 2, 4
-; BE-NEXT:    vand 2, 2, 5
-; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1168
-; BE-NEXT:    vslw 3, 3, 4
-; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1184
-; BE-NEXT:    vor 2, 2, 3
-; BE-NEXT:    vand 3, 2, 5
-; BE-NEXT:    vsrw 2, 2, 4
-; BE-NEXT:    vslw 3, 3, 4
-; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1200
-; BE-NEXT:    vand 2, 2, 5
-; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    li 3, 1456
-; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
 ; BE-NEXT:    li 3, 1440
-; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
 ; BE-NEXT:    li 3, 1424
-; BE-NEXT:    vor 2, 2, 3
-; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    vsrw 2, 2, 3
+; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
 ; BE-NEXT:    li 3, 1408
-; BE-NEXT:    lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
 ; BE-NEXT:    li 3, 1392
-; BE-NEXT:    lvx 27, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    lvx 28, 1, 3 # 16-byte Folded Reload
 ; BE-NEXT:    li 3, 1376
-; BE-NEXT:    vsrw 3, 2, 4
-; BE-NEXT:    lvx 26, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    lvx 27, 1, 3 # 16-byte Folded Reload
 ; BE-NEXT:    li 3, 1360
-; BE-NEXT:    lvx 25, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    lvx 26, 1, 3 # 16-byte Folded Reload
 ; BE-NEXT:    li 3, 1344
-; BE-NEXT:    lvx 24, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    lvx 25, 1, 3 # 16-byte Folded Reload
 ; BE-NEXT:    li 3, 1328
-; BE-NEXT:    vand 2, 2, 5
-; BE-NEXT:    vadduwm 2, 2, 2
-; BE-NEXT:    lvx 23, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    lvx 24, 1, 3 # 16-byte Folded Reload
 ; BE-NEXT:    li 3, 1312
-; BE-NEXT:    lvx 22, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    lvx 23, 1, 3 # 16-byte Folded Reload
 ; BE-NEXT:    li 3, 1296
-; BE-NEXT:    vand 3, 3, 5
-; BE-NEXT:    lvx 21, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    lvx 22, 1, 3 # 16-byte Folded Reload
 ; BE-NEXT:    li 3, 1280
+; BE-NEXT:    lvx 21, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1264
 ; BE-NEXT:    lvx 20, 1, 3 # 16-byte Folded Reload
-; BE-NEXT:    vor 2, 3, 2
-; BE-NEXT:    vsrw 2, 2, 4
-; BE-NEXT:    addi 1, 1, 1472
+; BE-NEXT:    addi 1, 1, 1456
 ; BE-NEXT:    blr
 ;
 ; LE-LABEL: clmulh_v4i32:
 ; LE:       # %bb.0:
-; LE-NEXT:    addis 3, 2, .LCPI10_0@toc@ha
-; LE-NEXT:    vspltisw 7, 12
-; LE-NEXT:    vspltisw 4, 8
-; LE-NEXT:    addi 3, 3, .LCPI10_0@toc@l
-; LE-NEXT:    vadduwm 7, 7, 7
-; LE-NEXT:    vsrw 18, 2, 4
-; LE-NEXT:    vspltisb 5, 15
+; LE-NEXT:    xxsldwi 0, 34, 34, 1
+; LE-NEXT:    lis 5, -13108
+; LE-NEXT:    lis 9, 13107
+; LE-NEXT:    xxswapd 1, 34
+; LE-NEXT:    lis 4, 21845
+; LE-NEXT:    lis 10, -3856
+; LE-NEXT:    lis 3, -21846
+; LE-NEXT:    xxsldwi 2, 35, 35, 1
+; LE-NEXT:    ori 6, 5, 52428
+; LE-NEXT:    ori 5, 9, 13107
+; LE-NEXT:    mffprwz 9, 0
+; LE-NEXT:    ori 7, 4, 21845
+; LE-NEXT:    ori 4, 10, 61680
+; LE-NEXT:    mffprwz 10, 1
+; LE-NEXT:    ori 8, 3, 43690
+; LE-NEXT:    lis 11, 3855
+; LE-NEXT:    ori 3, 11, 3855
+; LE-NEXT:    mffprwz 11, 2
+; LE-NEXT:    xxswapd 3, 35
+; LE-NEXT:    mffprwz 12, 3
+; LE-NEXT:    xxsldwi 4, 35, 35, 3
+; LE-NEXT:    xxsldwi 5, 34, 34, 3
+; LE-NEXT:    vspltisw 4, 2
 ; LE-NEXT:    vspltisw 0, 4
-; LE-NEXT:    lxvd2x 0, 0, 3
-; LE-NEXT:    vsrw 17, 2, 7
-; LE-NEXT:    addis 3, 2, .LCPI10_1@toc@ha
-; LE-NEXT:    vspltisw 6, 2
-; LE-NEXT:    vspltisw 1, 1
+; LE-NEXT:    vspltisw 5, 8
+; LE-NEXT:    slwi 0, 9, 1
+; LE-NEXT:    srwi 9, 9, 1
+; LE-NEXT:    and 0, 0, 8
+; LE-NEXT:    and 9, 9, 7
+; LE-NEXT:    vsldoi 9, 4, 4, 1
 ; LE-NEXT:    vsldoi 10, 0, 0, 1
-; LE-NEXT:    addi 3, 3, .LCPI10_1@toc@l
-; LE-NEXT:    vsldoi 13, 0, 0, 2
-; LE-NEXT:    vsldoi 9, 6, 6, 1
-; LE-NEXT:    vsldoi 12, 6, 6, 2
-; LE-NEXT:    vsldoi 14, 4, 4, 2
-; LE-NEXT:    vsldoi 16, 6, 6, 3
-; LE-NEXT:    vsldoi 8, 1, 1, 1
-; LE-NEXT:    vsldoi 11, 1, 1, 2
-; LE-NEXT:    vsldoi 15, 1, 1, 3
-; LE-NEXT:    xxland 1, 50, 0
-; LE-NEXT:    xxlor 1, 1, 49
-; LE-NEXT:    vslw 17, 2, 7
-; LE-NEXT:    xxland 34, 34, 0
-; LE-NEXT:    vslw 2, 2, 4
-; LE-NEXT:    xxlor 2, 49, 34
-; LE-NEXT:    xxlor 34, 2, 1
-; LE-NEXT:    xxland 50, 34, 37
-; LE-NEXT:    vsrw 2, 2, 0
-; LE-NEXT:    vslw 18, 18, 0
-; LE-NEXT:    xxland 1, 34, 37
-; LE-NEXT:    xxlor 34, 1, 50
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI10_2@toc@ha
-; LE-NEXT:    addi 3, 3, .LCPI10_2@toc@l
-; LE-NEXT:    xxland 51, 34, 1
-; LE-NEXT:    vsrw 2, 2, 6
-; LE-NEXT:    vslw 19, 19, 6
-; LE-NEXT:    xxland 2, 34, 1
-; LE-NEXT:    xxlor 34, 2, 51
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI10_3@toc@ha
-; LE-NEXT:    vsrw 19, 2, 1
-; LE-NEXT:    addi 3, 3, .LCPI10_3@toc@l
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI10_4@toc@ha
-; LE-NEXT:    xxland 34, 34, 2
-; LE-NEXT:    xxland 3, 51, 2
-; LE-NEXT:    vsrw 19, 3, 4
-; LE-NEXT:    addi 3, 3, .LCPI10_4@toc@l
-; LE-NEXT:    vadduwm 2, 2, 2
-; LE-NEXT:    xxlor 34, 3, 34
-; LE-NEXT:    xxland 3, 51, 0
-; LE-NEXT:    vsrw 19, 3, 7
-; LE-NEXT:    xxlor 3, 3, 51
-; LE-NEXT:    vslw 19, 3, 7
-; LE-NEXT:    xxland 35, 35, 0
-; LE-NEXT:    vsldoi 17, 0, 0, 3
-; LE-NEXT:    vslw 3, 3, 4
-; LE-NEXT:    xxlor 4, 51, 35
-; LE-NEXT:    xxlor 35, 4, 3
-; LE-NEXT:    xxland 51, 35, 37
-; LE-NEXT:    vsrw 3, 3, 0
-; LE-NEXT:    vslw 19, 19, 0
-; LE-NEXT:    xxland 3, 35, 37
-; LE-NEXT:    xxlor 35, 3, 51
-; LE-NEXT:    xxland 51, 35, 1
-; LE-NEXT:    vsrw 3, 3, 6
-; LE-NEXT:    vslw 19, 19, 6
-; LE-NEXT:    xxland 3, 35, 1
-; LE-NEXT:    xxlor 35, 3, 51
-; LE-NEXT:    vsrw 19, 3, 1
-; LE-NEXT:    xxland 35, 35, 2
-; LE-NEXT:    xxland 3, 51, 2
-; LE-NEXT:    vadduwm 3, 3, 3
-; LE-NEXT:    xxlor 3, 3, 35
-; LE-NEXT:    xxland 35, 3, 38
-; LE-NEXT:    xxland 51, 3, 33
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    vmuluwm 19, 2, 19
-; LE-NEXT:    xxlxor 4, 51, 35
-; LE-NEXT:    xxland 35, 3, 32
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 36
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    vadduwm 3, 4, 4
-; LE-NEXT:    xxland 35, 3, 35
-; LE-NEXT:    vsldoi 18, 4, 4, 3
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI10_5@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI10_5@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    vslw 3, 0, 0
-; LE-NEXT:    xxland 35, 3, 35
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI10_6@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI10_6@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 40
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 41
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 42
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    vslw 3, 4, 4
-; LE-NEXT:    xxland 35, 3, 35
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI10_7@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI10_7@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI10_8@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI10_8@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI10_9@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI10_9@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI10_10@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI10_10@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 43
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 44
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 45
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 46
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI10_11@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI10_11@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI10_12@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI10_12@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI10_13@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI10_13@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI10_14@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI10_14@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 47
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 48
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 49
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 50
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI10_15@toc@ha
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    addi 3, 3, .LCPI10_15@toc@l
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxland 35, 3, 5
-; LE-NEXT:    vmuluwm 3, 2, 3
-; LE-NEXT:    xxlxor 4, 4, 35
-; LE-NEXT:    xxleqv 35, 35, 35
-; LE-NEXT:    vslw 3, 3, 3
-; LE-NEXT:    xxland 35, 3, 35
-; LE-NEXT:    vmuluwm 2, 2, 3
-; LE-NEXT:    xxlxor 34, 4, 34
-; LE-NEXT:    vsrw 8, 2, 4
-; LE-NEXT:    vsrw 3, 2, 7
-; LE-NEXT:    xxland 3, 40, 0
-; LE-NEXT:    xxlor 3, 3, 35
-; LE-NEXT:    vslw 3, 2, 7
-; LE-NEXT:    xxland 34, 34, 0
-; LE-NEXT:    vslw 2, 2, 4
-; LE-NEXT:    xxlor 0, 35, 34
-; LE-NEXT:    xxlor 34, 0, 3
-; LE-NEXT:    xxland 35, 34, 37
-; LE-NEXT:    vsrw 2, 2, 0
-; LE-NEXT:    vslw 3, 3, 0
-; LE-NEXT:    xxland 0, 34, 37
-; LE-NEXT:    xxlor 34, 0, 35
-; LE-NEXT:    xxland 35, 34, 1
-; LE-NEXT:    vsrw 2, 2, 6
-; LE-NEXT:    vslw 3, 3, 6
-; LE-NEXT:    xxland 0, 34, 1
-; LE-NEXT:    xxlor 34, 0, 35
-; LE-NEXT:    vsrw 3, 2, 1
-; LE-NEXT:    xxland 34, 34, 2
-; LE-NEXT:    xxland 0, 35, 2
-; LE-NEXT:    vadduwm 2, 2, 2
-; LE-NEXT:    xxlor 34, 0, 34
-; LE-NEXT:    vsrw 2, 2, 1
+; LE-NEXT:    vsldoi 6, 4, 4, 2
+; LE-NEXT:    vsldoi 7, 0, 0, 2
+; LE-NEXT:    vsldoi 8, 5, 5, 2
+; LE-NEXT:    or 9, 9, 0
+; LE-NEXT:    slwi 0, 10, 1
+; LE-NEXT:    srwi 10, 10, 1
+; LE-NEXT:    and 0, 0, 8
+; LE-NEXT:    and 10, 10, 7
+; LE-NEXT:    or 10, 10, 0
+; LE-NEXT:    slwi 0, 11, 1
+; LE-NEXT:    srwi 11, 11, 1
+; LE-NEXT:    and 0, 0, 8
+; LE-NEXT:    and 11, 11, 7
+; LE-NEXT:    or 11, 11, 0
+; LE-NEXT:    slwi 0, 12, 1
+; LE-NEXT:    srwi 12, 12, 1
+; LE-NEXT:    and 0, 0, 8
+; LE-NEXT:    and 12, 12, 7
+; LE-NEXT:    or 12, 12, 0
+; LE-NEXT:    slwi 0, 9, 2
+; LE-NEXT:    srwi 9, 9, 2
+; LE-NEXT:    and 0, 0, 6
+; LE-NEXT:    and 9, 9, 5
+; LE-NEXT:    or 9, 9, 0
+; LE-NEXT:    slwi 0, 10, 2
+; LE-NEXT:    srwi 10, 10, 2
+; LE-NEXT:    and 0, 0, 6
+; LE-NEXT:    and 10, 10, 5
+; LE-NEXT:    or 10, 10, 0
+; LE-NEXT:    slwi 0, 11, 2
+; LE-NEXT:    srwi 11, 11, 2
+; LE-NEXT:    and 0, 0, 6
+; LE-NEXT:    and 11, 11, 5
+; LE-NEXT:    or 11, 11, 0
+; LE-NEXT:    slwi 0, 9, 4
+; LE-NEXT:    srwi 9, 9, 4
+; LE-NEXT:    and 0, 0, 4
+; LE-NEXT:    and 9, 9, 3
+; LE-NEXT:    or 9, 9, 0
+; LE-NEXT:    slwi 0, 10, 4
+; LE-NEXT:    srwi 10, 10, 4
+; LE-NEXT:    and 0, 0, 4
+; LE-NEXT:    and 10, 10, 3
+; LE-NEXT:    or 10, 10, 0
+; LE-NEXT:    slwi 0, 11, 4
+; LE-NEXT:    srwi 11, 11, 4
+; LE-NEXT:    and 0, 0, 4
+; LE-NEXT:    and 11, 11, 3
+; LE-NEXT:    or 11, 11, 0
+; LE-NEXT:    rotlwi 0, 9, 24
+; LE-NEXT:    rlwimi 0, 9, 8, 8, 15
+; LE-NEXT:    rlwimi 0, 9, 8, 24, 31
+; LE-NEXT:    rotlwi 9, 10, 24
+; LE-NEXT:    rlwimi 9, 10, 8, 8, 15
+; LE-NEXT:    rlwimi 9, 10, 8, 24, 31
+; LE-NEXT:    rotlwi 10, 11, 24
+; LE-NEXT:    rlwimi 10, 11, 8, 8, 15
+; LE-NEXT:    rlwimi 10, 11, 8, 24, 31
+; LE-NEXT:    rldicl 11, 0, 0, 32
+; LE-NEXT:    rldicl 0, 9, 0, 32
+; LE-NEXT:    mffprwz 9, 5
+; LE-NEXT:    rldicl 10, 10, 0, 32
+; LE-NEXT:    rldimi 0, 11, 32, 0
+; LE-NEXT:    slwi 11, 12, 2
+; LE-NEXT:    srwi 12, 12, 2
+; LE-NEXT:    and 11, 11, 6
+; LE-NEXT:    and 12, 12, 5
+; LE-NEXT:    mtfprd 1, 0
+; LE-NEXT:    or 11, 12, 11
+; LE-NEXT:    slwi 12, 11, 4
+; LE-NEXT:    srwi 11, 11, 4
+; LE-NEXT:    and 12, 12, 4
+; LE-NEXT:    and 11, 11, 3
+; LE-NEXT:    or 11, 11, 12
+; LE-NEXT:    rotlwi 12, 11, 24
+; LE-NEXT:    rlwimi 12, 11, 8, 8, 15
+; LE-NEXT:    rlwimi 12, 11, 8, 24, 31
+; LE-NEXT:    rldicl 11, 12, 0, 32
+; LE-NEXT:    rldimi 11, 10, 32, 0
+; LE-NEXT:    mffprwz 10, 4
+; LE-NEXT:    mtfprd 0, 11
+; LE-NEXT:    mfvsrwz 11, 35
+; LE-NEXT:    slwi 12, 10, 1
+; LE-NEXT:    srwi 10, 10, 1
+; LE-NEXT:    and 12, 12, 8
+; LE-NEXT:    and 10, 10, 7
+; LE-NEXT:    or 10, 10, 12
+; LE-NEXT:    slwi 12, 10, 2
+; LE-NEXT:    srwi 10, 10, 2
+; LE-NEXT:    and 12, 12, 6
+; LE-NEXT:    and 10, 10, 5
+; LE-NEXT:    or 10, 10, 12
+; LE-NEXT:    slwi 12, 10, 4
+; LE-NEXT:    srwi 10, 10, 4
+; LE-NEXT:    and 12, 12, 4
+; LE-NEXT:    and 10, 10, 3
+; LE-NEXT:    or 10, 10, 12
+; LE-NEXT:    rotlwi 12, 10, 24
+; LE-NEXT:    rlwimi 12, 10, 8, 8, 15
+; LE-NEXT:    rlwimi 12, 10, 8, 24, 31
+; LE-NEXT:    slwi 10, 11, 1
+; LE-NEXT:    srwi 11, 11, 1
+; LE-NEXT:    rldicl 12, 12, 0, 32
+; LE-NEXT:    and 10, 10, 8
+; LE-NEXT:    and 11, 11, 7
+; LE-NEXT:    or 10, 11, 10
+; LE-NEXT:    slwi 11, 10, 2
+; LE-NEXT:    srwi 10, 10, 2
+; LE-NEXT:    and 11, 11, 6
+; LE-NEXT:    and 10, 10, 5
+; LE-NEXT:    or 10, 10, 11
+; LE-NEXT:    slwi 11, 10, 4
+; LE-NEXT:    srwi 10, 10, 4
+; LE-NEXT:    and 11, 11, 4
+; LE-NEXT:    and 10, 10, 3
+; LE-NEXT:    or 10, 10, 11
+; LE-NEXT:    rotlwi 11, 10, 24
+; LE-NEXT:    rlwimi 11, 10, 8, 8, 15
+; LE-NEXT:    rlwimi 11, 10, 8, 24, 31
+; LE-NEXT:    mfvsrwz 10, 34
+; LE-NEXT:    rldicl 11, 11, 0, 32
+; LE-NEXT:    rldimi 11, 12, 32, 0
+; LE-NEXT:    mtfprd 2, 11
+; LE-NEXT:    slwi 11, 9, 1
+; LE-NEXT:    srwi 9, 9, 1
+; LE-NEXT:    and 11, 11, 8
+; LE-NEXT:    and 9, 9, 7
+; LE-NEXT:    or 9, 9, 11
+; LE-NEXT:    slwi 11, 9, 2
+; LE-NEXT:    srwi 9, 9, 2
+; LE-NEXT:    and 11, 11, 6
+; LE-NEXT:    and 9, 9, 5
+; LE-NEXT:    or 9, 9, 11
+; LE-NEXT:    slwi 11, 9, 4
+; LE-NEXT:    srwi 9, 9, 4
+; LE-NEXT:    and 11, 11, 4
+; LE-NEXT:    and 9, 9, 3
+; LE-NEXT:    or 9, 9, 11
+; LE-NEXT:    rotlwi 11, 9, 24
+; LE-NEXT:    rlwimi 11, 9, 8, 8, 15
+; LE-NEXT:    rlwimi 11, 9, 8, 24, 31
+; LE-NEXT:    rldicl 9, 11, 0, 32
+; LE-NEXT:    slwi 11, 10, 1
+; LE-NEXT:    srwi 10, 10, 1
+; LE-NEXT:    and 11, 11, 8
+; LE-NEXT:    and 10, 10, 7
+; LE-NEXT:    or 10, 10, 11
+; LE-NEXT:    slwi 11, 10, 2
+; LE-NEXT:    srwi 10, 10, 2
+; LE-NEXT:    and 11, 11, 6
+; LE-NEXT:    and 10, 10, 5
+; LE-NEXT:    or 10, 10, 11
+; LE-NEXT:    slwi 11, 10, 4
+; LE-NEXT:    srwi 10, 10, 4
+; LE-NEXT:    and 11, 11, 4
+; LE-NEXT:    and 10, 10, 3
+; LE-NEXT:    or 10, 10, 11
+; LE-NEXT:    rotlwi 11, 10, 24
+; LE-NEXT:    rlwimi 11, 10, 8, 8, 15
+; LE-NEXT:    rlwimi 11, 10, 8, 24, 31
+; LE-NEXT:    rldicl 10, 11, 0, 32
+; LE-NEXT:    addis 11, 2, .LCPI10_12@toc@ha
+; LE-NEXT:    rldimi 10, 9, 32, 0
+; LE-NEXT:    addis 9, 2, .LCPI10_0@toc@ha
+; LE-NEXT:    addi 9, 9, .LCPI10_0@toc@l
+; LE-NEXT:    vspltisw 2, 1
+; LE-NEXT:    vsldoi 11, 2, 2, 1
+; LE-NEXT:    vsldoi 1, 2, 2, 2
+; LE-NEXT:    xxmrghd 0, 2, 0
+; LE-NEXT:    mtfprd 2, 10
+; LE-NEXT:    xxland 44, 0, 36
+; LE-NEXT:    xxland 45, 0, 34
+; LE-NEXT:    xxland 43, 0, 43
+; LE-NEXT:    xxland 41, 0, 41
+; LE-NEXT:    addis 10, 2, .LCPI10_11@toc@ha
+; LE-NEXT:    addi 10, 10, .LCPI10_11@toc@l
+; LE-NEXT:    lxvd2x 3, 0, 10
+; LE-NEXT:    xxmrghd 35, 2, 1
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI10_1@toc@ha
+; LE-NEXT:    vmuluwm 12, 3, 12
+; LE-NEXT:    vmuluwm 13, 3, 13
+; LE-NEXT:    addi 9, 9, .LCPI10_1@toc@l
+; LE-NEXT:    vmuluwm 11, 3, 11
+; LE-NEXT:    vmuluwm 9, 3, 9
+; LE-NEXT:    xxlxor 1, 45, 44
+; LE-NEXT:    xxland 44, 0, 32
+; LE-NEXT:    vmuluwm 12, 3, 12
+; LE-NEXT:    xxlxor 1, 1, 44
+; LE-NEXT:    xxland 44, 0, 37
+; LE-NEXT:    vmuluwm 12, 3, 12
+; LE-NEXT:    xxlxor 1, 1, 44
+; LE-NEXT:    vadduwm 12, 5, 5
+; LE-NEXT:    xxland 44, 0, 44
+; LE-NEXT:    vmuluwm 12, 3, 12
+; LE-NEXT:    xxlxor 1, 1, 44
+; LE-NEXT:    xxland 44, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI10_2@toc@ha
+; LE-NEXT:    vmuluwm 12, 3, 12
+; LE-NEXT:    addi 9, 9, .LCPI10_2@toc@l
+; LE-NEXT:    xxlxor 1, 1, 44
+; LE-NEXT:    vslw 12, 0, 0
+; LE-NEXT:    xxland 44, 0, 44
+; LE-NEXT:    vmuluwm 12, 3, 12
+; LE-NEXT:    xxlxor 1, 1, 44
+; LE-NEXT:    xxland 44, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI10_3@toc@ha
+; LE-NEXT:    vmuluwm 12, 3, 12
+; LE-NEXT:    addi 9, 9, .LCPI10_3@toc@l
+; LE-NEXT:    xxlxor 1, 1, 44
+; LE-NEXT:    xxlxor 1, 1, 43
+; LE-NEXT:    xxlxor 1, 1, 41
+; LE-NEXT:    xxland 41, 0, 42
+; LE-NEXT:    vslw 10, 5, 5
+; LE-NEXT:    vmuluwm 9, 3, 9
+; LE-NEXT:    xxlxor 1, 1, 41
+; LE-NEXT:    vsldoi 11, 2, 2, 3
+; LE-NEXT:    vsldoi 5, 5, 5, 3
+; LE-NEXT:    vsldoi 9, 4, 4, 3
+; LE-NEXT:    xxland 36, 0, 42
+; LE-NEXT:    vmuluwm 4, 3, 4
+; LE-NEXT:    xxlxor 1, 1, 36
+; LE-NEXT:    vsldoi 4, 0, 0, 3
+; LE-NEXT:    xxland 32, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI10_4@toc@ha
+; LE-NEXT:    vmuluwm 0, 3, 0
+; LE-NEXT:    addi 9, 9, .LCPI10_4@toc@l
+; LE-NEXT:    xxland 36, 0, 36
+; LE-NEXT:    vmuluwm 4, 3, 4
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI10_5@toc@ha
+; LE-NEXT:    vmuluwm 0, 3, 0
+; LE-NEXT:    addi 9, 9, .LCPI10_5@toc@l
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI10_6@toc@ha
+; LE-NEXT:    vmuluwm 0, 3, 0
+; LE-NEXT:    addi 9, 9, .LCPI10_6@toc@l
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI10_7@toc@ha
+; LE-NEXT:    vmuluwm 0, 3, 0
+; LE-NEXT:    addi 9, 9, .LCPI10_7@toc@l
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 33
+; LE-NEXT:    vmuluwm 0, 3, 0
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 38
+; LE-NEXT:    vmuluwm 0, 3, 0
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 39
+; LE-NEXT:    vmuluwm 0, 3, 0
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 40
+; LE-NEXT:    vmuluwm 0, 3, 0
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI10_8@toc@ha
+; LE-NEXT:    vmuluwm 0, 3, 0
+; LE-NEXT:    addi 9, 9, .LCPI10_8@toc@l
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI10_9@toc@ha
+; LE-NEXT:    vmuluwm 0, 3, 0
+; LE-NEXT:    addi 9, 9, .LCPI10_9@toc@l
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addis 9, 2, .LCPI10_10@toc@ha
+; LE-NEXT:    vmuluwm 0, 3, 0
+; LE-NEXT:    addi 9, 9, .LCPI10_10@toc@l
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 2
+; LE-NEXT:    lxvd2x 2, 0, 9
+; LE-NEXT:    addi 9, 11, .LCPI10_12@toc@l
+; LE-NEXT:    vmuluwm 0, 3, 0
+; LE-NEXT:    lxvd2x 4, 0, 9
+; LE-NEXT:    xxland 33, 0, 4
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 43
+; LE-NEXT:    vmuluwm 1, 3, 1
+; LE-NEXT:    vmuluwm 0, 3, 0
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 41
+; LE-NEXT:    vmuluwm 0, 3, 0
+; LE-NEXT:    xxlxor 1, 1, 32
+; LE-NEXT:    xxland 32, 0, 3
+; LE-NEXT:    xxlxor 1, 1, 36
+; LE-NEXT:    xxland 36, 0, 37
+; LE-NEXT:    xxland 37, 0, 2
+; LE-NEXT:    vmuluwm 0, 3, 0
+; LE-NEXT:    vmuluwm 4, 3, 4
+; LE-NEXT:    vmuluwm 5, 3, 5
+; LE-NEXT:    xxlxor 1, 1, 36
+; LE-NEXT:    xxleqv 36, 36, 36
+; LE-NEXT:    vslw 4, 4, 4
+; LE-NEXT:    xxland 36, 0, 36
+; LE-NEXT:    xxlxor 0, 1, 37
+; LE-NEXT:    vmuluwm 3, 3, 4
+; LE-NEXT:    xxlxor 0, 0, 32
+; LE-NEXT:    xxlxor 0, 0, 33
+; LE-NEXT:    xxlxor 0, 0, 35
+; LE-NEXT:    xxsldwi 1, 0, 0, 1
+; LE-NEXT:    xxswapd 2, 0
+; LE-NEXT:    xxsldwi 3, 0, 0, 3
+; LE-NEXT:    mffprwz 9, 1
+; LE-NEXT:    mffprwz 10, 2
+; LE-NEXT:    mffprwz 11, 3
+; LE-NEXT:    slwi 12, 9, 1
+; LE-NEXT:    srwi 9, 9, 1
+; LE-NEXT:    and 12, 12, 8
+; LE-NEXT:    and 9, 9, 7
+; LE-NEXT:    or 9, 9, 12
+; LE-NEXT:    slwi 12, 10, 1
+; LE-NEXT:    srwi 10, 10, 1
+; LE-NEXT:    and 12, 12, 8
+; LE-NEXT:    and 10, 10, 7
+; LE-NEXT:    or 10, 10, 12
+; LE-NEXT:    slwi 12, 11, 1
+; LE-NEXT:    srwi 11, 11, 1
+; LE-NEXT:    and 12, 12, 8
+; LE-NEXT:    and 11, 11, 7
+; LE-NEXT:    or 11, 11, 12
+; LE-NEXT:    slwi 12, 9, 2
+; LE-NEXT:    srwi 9, 9, 2
+; LE-NEXT:    and 12, 12, 6
+; LE-NEXT:    and 9, 9, 5
+; LE-NEXT:    or 9, 9, 12
+; LE-NEXT:    slwi 12, 10, 2
+; LE-NEXT:    srwi 10, 10, 2
+; LE-NEXT:    and 12, 12, 6
+; LE-NEXT:    and 10, 10, 5
+; LE-NEXT:    or 10, 10, 12
+; LE-NEXT:    slwi 12, 11, 2
+; LE-NEXT:    srwi 11, 11, 2
+; LE-NEXT:    and 12, 12, 6
+; LE-NEXT:    and 11, 11, 5
+; LE-NEXT:    or 11, 11, 12
+; LE-NEXT:    slwi 12, 9, 4
+; LE-NEXT:    srwi 9, 9, 4
+; LE-NEXT:    and 12, 12, 4
+; LE-NEXT:    and 9, 9, 3
+; LE-NEXT:    or 9, 9, 12
+; LE-NEXT:    slwi 12, 10, 4
+; LE-NEXT:    srwi 10, 10, 4
+; LE-NEXT:    and 12, 12, 4
+; LE-NEXT:    and 10, 10, 3
+; LE-NEXT:    or 10, 10, 12
+; LE-NEXT:    slwi 12, 11, 4
+; LE-NEXT:    srwi 11, 11, 4
+; LE-NEXT:    and 12, 12, 4
+; LE-NEXT:    and 11, 11, 3
+; LE-NEXT:    or 11, 11, 12
+; LE-NEXT:    rotlwi 12, 9, 24
+; LE-NEXT:    rlwimi 12, 9, 8, 8, 15
+; LE-NEXT:    rlwimi 12, 9, 8, 24, 31
+; LE-NEXT:    rotlwi 9, 10, 24
+; LE-NEXT:    rlwimi 9, 10, 8, 8, 15
+; LE-NEXT:    rlwimi 9, 10, 8, 24, 31
+; LE-NEXT:    rotlwi 10, 11, 24
+; LE-NEXT:    rldicl 9, 9, 0, 32
+; LE-NEXT:    rlwimi 10, 11, 8, 8, 15
+; LE-NEXT:    rlwimi 10, 11, 8, 24, 31
+; LE-NEXT:    rldicl 11, 12, 0, 32
+; LE-NEXT:    rldimi 9, 11, 32, 0
+; LE-NEXT:    mtfprd 1, 9
+; LE-NEXT:    rldicl 9, 10, 0, 32
+; LE-NEXT:    mffprwz 10, 0
+; LE-NEXT:    slwi 11, 10, 1
+; LE-NEXT:    srwi 10, 10, 1
+; LE-NEXT:    and 8, 11, 8
+; LE-NEXT:    and 7, 10, 7
+; LE-NEXT:    or 7, 7, 8
+; LE-NEXT:    slwi 8, 7, 2
+; LE-NEXT:    srwi 7, 7, 2
+; LE-NEXT:    and 6, 8, 6
+; LE-NEXT:    and 5, 7, 5
+; LE-NEXT:    or 5, 5, 6
+; LE-NEXT:    slwi 6, 5, 4
+; LE-NEXT:    srwi 5, 5, 4
+; LE-NEXT:    and 4, 6, 4
+; LE-NEXT:    and 3, 5, 3
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    rotlwi 4, 3, 24
+; LE-NEXT:    rlwimi 4, 3, 8, 8, 15
+; LE-NEXT:    rlwimi 4, 3, 8, 24, 31
+; LE-NEXT:    rldicl 3, 4, 0, 32
+; LE-NEXT:    rldimi 3, 9, 32, 0
+; LE-NEXT:    mtfprd 0, 3
+; LE-NEXT:    xxmrghd 35, 0, 1
+; LE-NEXT:    vsrw 2, 3, 2
 ; LE-NEXT:    blr
   %a.ext = zext <4 x i32> %a to <4 x i64>
   %b.ext = zext <4 x i32> %b to <4 x i64>
diff --git a/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll
index 0d5c78b6d779a..3bcbc9a72c5cd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll
@@ -3469,139 +3469,138 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vv(<vscale x 1 x i64> %va, <vscale x 1
 ; RV32V-NEXT:    lui a6, 8192
 ; RV32V-NEXT:    lui a5, 16384
 ; RV32V-NEXT:    lui a3, 32768
-; RV32V-NEXT:    sw a1, 272(sp)
-; RV32V-NEXT:    sw zero, 276(sp)
+; RV32V-NEXT:    sw a1, 16(sp)
+; RV32V-NEXT:    sw zero, 20(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw t5, 276(sp)
 ; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw t5, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw a4, 260(sp)
+; RV32V-NEXT:    sw a4, 268(sp)
 ; RV32V-NEXT:    lui a4, 65536
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw a2, 252(sp)
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw a2, 260(sp)
 ; RV32V-NEXT:    lui a2, 131072
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw s11, 244(sp)
+; RV32V-NEXT:    sw zero, 248(sp)
+; RV32V-NEXT:    sw s11, 252(sp)
 ; RV32V-NEXT:    vsetvli s11, zero, e64, m1, ta, ma
 ; RV32V-NEXT:    vand.vi v13, v9, 2
 ; RV32V-NEXT:    vand.vi v14, v9, 1
 ; RV32V-NEXT:    vand.vi v12, v9, 4
 ; RV32V-NEXT:    vand.vi v11, v9, 8
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw a0, 236(sp)
+; RV32V-NEXT:    sw zero, 240(sp)
+; RV32V-NEXT:    sw a0, 244(sp)
 ; RV32V-NEXT:    vand.vx v10, v9, a0
-; RV32V-NEXT:    addi s11, sp, 272
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw ra, 228(sp)
+; RV32V-NEXT:    addi s11, sp, 16
+; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    sw ra, 236(sp)
 ; RV32V-NEXT:    vand.vx v15, v9, ra
-; RV32V-NEXT:    addi ra, sp, 264
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s10, 220(sp)
+; RV32V-NEXT:    addi ra, sp, 272
+; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw s10, 228(sp)
 ; RV32V-NEXT:    vand.vx v16, v9, s10
-; RV32V-NEXT:    addi s10, sp, 256
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s9, 212(sp)
+; RV32V-NEXT:    addi s10, sp, 264
+; RV32V-NEXT:    sw zero, 216(sp)
+; RV32V-NEXT:    sw s9, 220(sp)
 ; RV32V-NEXT:    vand.vx v17, v9, s9
-; RV32V-NEXT:    addi s9, sp, 248
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s8, 204(sp)
+; RV32V-NEXT:    addi s9, sp, 256
+; RV32V-NEXT:    sw zero, 208(sp)
+; RV32V-NEXT:    sw s8, 212(sp)
 ; RV32V-NEXT:    vand.vx v18, v9, s8
-; RV32V-NEXT:    addi s8, sp, 240
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s7, 196(sp)
+; RV32V-NEXT:    addi s8, sp, 248
+; RV32V-NEXT:    sw zero, 200(sp)
+; RV32V-NEXT:    sw s7, 204(sp)
 ; RV32V-NEXT:    vand.vx v19, v9, s7
-; RV32V-NEXT:    addi s7, sp, 232
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s1, 188(sp)
+; RV32V-NEXT:    addi s7, sp, 240
+; RV32V-NEXT:    sw zero, 192(sp)
+; RV32V-NEXT:    sw s1, 196(sp)
 ; RV32V-NEXT:    vand.vx v20, v9, s1
 ; RV32V-NEXT:    slli t5, t5, 11
 ; RV32V-NEXT:    vand.vx v21, v9, s6
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw t5, 188(sp)
 ; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw t5, 180(sp)
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw s6, 172(sp)
-; RV32V-NEXT:    addi s6, sp, 216
+; RV32V-NEXT:    sw s6, 180(sp)
+; RV32V-NEXT:    addi s6, sp, 224
 ; RV32V-NEXT:    vand.vx v22, v9, s5
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s5, 164(sp)
-; RV32V-NEXT:    addi s5, sp, 208
+; RV32V-NEXT:    sw zero, 168(sp)
+; RV32V-NEXT:    sw s5, 172(sp)
+; RV32V-NEXT:    addi s5, sp, 216
 ; RV32V-NEXT:    vand.vx v23, v9, s4
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw s4, 156(sp)
-; RV32V-NEXT:    addi s4, sp, 200
+; RV32V-NEXT:    sw zero, 160(sp)
+; RV32V-NEXT:    sw s4, 164(sp)
+; RV32V-NEXT:    addi s4, sp, 208
 ; RV32V-NEXT:    vand.vx v24, v9, s3
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw s3, 148(sp)
-; RV32V-NEXT:    addi s3, sp, 192
+; RV32V-NEXT:    sw zero, 152(sp)
+; RV32V-NEXT:    sw s3, 156(sp)
+; RV32V-NEXT:    addi s3, sp, 200
 ; RV32V-NEXT:    vand.vx v25, v9, s2
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw s2, 140(sp)
-; RV32V-NEXT:    addi s2, sp, 184
+; RV32V-NEXT:    sw zero, 144(sp)
+; RV32V-NEXT:    sw s2, 148(sp)
+; RV32V-NEXT:    addi s2, sp, 192
 ; RV32V-NEXT:    vand.vx v26, v9, s0
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw s0, 132(sp)
-; RV32V-NEXT:    addi s1, sp, 176
+; RV32V-NEXT:    sw zero, 136(sp)
+; RV32V-NEXT:    sw s0, 140(sp)
+; RV32V-NEXT:    addi s1, sp, 184
 ; RV32V-NEXT:    vand.vx v27, v9, t6
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t6, 124(sp)
-; RV32V-NEXT:    addi s0, sp, 168
+; RV32V-NEXT:    sw zero, 128(sp)
+; RV32V-NEXT:    sw t6, 132(sp)
+; RV32V-NEXT:    addi s0, sp, 176
 ; RV32V-NEXT:    vand.vx v28, v9, t4
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t4, 116(sp)
-; RV32V-NEXT:    addi t6, sp, 160
+; RV32V-NEXT:    sw zero, 120(sp)
+; RV32V-NEXT:    sw t4, 124(sp)
+; RV32V-NEXT:    addi t6, sp, 168
 ; RV32V-NEXT:    vand.vx v29, v9, t3
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t3, 108(sp)
-; RV32V-NEXT:    addi t4, sp, 152
+; RV32V-NEXT:    sw zero, 112(sp)
+; RV32V-NEXT:    sw t3, 116(sp)
+; RV32V-NEXT:    addi t4, sp, 160
 ; RV32V-NEXT:    vand.vx v30, v9, t2
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw t2, 100(sp)
-; RV32V-NEXT:    addi t3, sp, 144
+; RV32V-NEXT:    sw zero, 104(sp)
+; RV32V-NEXT:    sw t2, 108(sp)
+; RV32V-NEXT:    addi t3, sp, 152
 ; RV32V-NEXT:    vand.vx v31, v9, t1
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw t1, 92(sp)
-; RV32V-NEXT:    addi t2, sp, 136
+; RV32V-NEXT:    sw zero, 96(sp)
+; RV32V-NEXT:    sw t1, 100(sp)
+; RV32V-NEXT:    addi t2, sp, 144
 ; RV32V-NEXT:    vand.vx v7, v9, t0
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw t0, 84(sp)
-; RV32V-NEXT:    addi t1, sp, 128
+; RV32V-NEXT:    sw zero, 88(sp)
+; RV32V-NEXT:    sw t0, 92(sp)
+; RV32V-NEXT:    addi t1, sp, 136
 ; RV32V-NEXT:    vand.vx v6, v9, a7
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a7, 76(sp)
-; RV32V-NEXT:    addi t0, sp, 120
+; RV32V-NEXT:    sw zero, 80(sp)
+; RV32V-NEXT:    sw a7, 84(sp)
+; RV32V-NEXT:    addi t0, sp, 128
 ; RV32V-NEXT:    vand.vx v5, v9, a6
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a6, 68(sp)
-; RV32V-NEXT:    addi a7, sp, 112
+; RV32V-NEXT:    sw zero, 72(sp)
+; RV32V-NEXT:    sw a6, 76(sp)
+; RV32V-NEXT:    addi a7, sp, 120
 ; RV32V-NEXT:    vand.vx v4, v9, a5
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a5, 60(sp)
-; RV32V-NEXT:    addi a6, sp, 104
+; RV32V-NEXT:    sw zero, 64(sp)
+; RV32V-NEXT:    sw a5, 68(sp)
+; RV32V-NEXT:    addi a6, sp, 112
 ; RV32V-NEXT:    vand.vx v3, v9, a3
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a3, 52(sp)
-; RV32V-NEXT:    addi a5, sp, 96
+; RV32V-NEXT:    sw zero, 56(sp)
+; RV32V-NEXT:    sw a3, 60(sp)
+; RV32V-NEXT:    addi a5, sp, 104
 ; RV32V-NEXT:    vand.vx v2, v9, a4
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw a4, 44(sp)
-; RV32V-NEXT:    addi a4, sp, 88
+; RV32V-NEXT:    sw zero, 48(sp)
+; RV32V-NEXT:    sw a4, 52(sp)
+; RV32V-NEXT:    addi a4, sp, 96
 ; RV32V-NEXT:    vand.vx v1, v9, a2
+; RV32V-NEXT:    sw zero, 40(sp)
+; RV32V-NEXT:    sw a2, 44(sp)
+; RV32V-NEXT:    addi a3, sp, 88
 ; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw a2, 36(sp)
-; RV32V-NEXT:    addi a3, sp, 80
-; RV32V-NEXT:    sw zero, 24(sp)
 ; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    sw a0, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw a1, 20(sp)
-; RV32V-NEXT:    addi a2, sp, 72
+; RV32V-NEXT:    sw a0, 36(sp)
+; RV32V-NEXT:    sw zero, 24(sp)
+; RV32V-NEXT:    sw a1, 28(sp)
+; RV32V-NEXT:    addi a2, sp, 80
 ; RV32V-NEXT:    vand.vx v0, v9, t5
-; RV32V-NEXT:    addi a1, sp, 64
+; RV32V-NEXT:    addi a1, sp, 72
 ; RV32V-NEXT:    vmul.vv v13, v8, v13
 ; RV32V-NEXT:    vmul.vv v14, v8, v14
-; RV32V-NEXT:    vxor.vi v14, v14, 0
 ; RV32V-NEXT:    vxor.vv v14, v14, v13
 ; RV32V-NEXT:    vlse64.v v13, (s11), zero
-; RV32V-NEXT:    addi s11, sp, 56
+; RV32V-NEXT:    addi s11, sp, 64
 ; RV32V-NEXT:    vmul.vv v12, v8, v12
 ; RV32V-NEXT:    vxor.vv v14, v14, v12
 ; RV32V-NEXT:    vlse64.v v12, (ra), zero
@@ -3613,7 +3612,7 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vv(<vscale x 1 x i64> %va, <vscale x 1
 ; RV32V-NEXT:    add t5, sp, t5
 ; RV32V-NEXT:    addi t5, t5, 288
 ; RV32V-NEXT:    vs1r.v v12, (t5) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi ra, sp, 48
+; RV32V-NEXT:    addi ra, sp, 56
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v14, v14, v11
 ; RV32V-NEXT:    vlse64.v v11, (s10), zero
@@ -3623,7 +3622,7 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vv(<vscale x 1 x i64> %va, <vscale x 1
 ; RV32V-NEXT:    add t5, sp, t5
 ; RV32V-NEXT:    addi t5, t5, 288
 ; RV32V-NEXT:    vs1r.v v11, (t5) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi s10, sp, 40
+; RV32V-NEXT:    addi s10, sp, 48
 ; RV32V-NEXT:    vmul.vv v10, v8, v10
 ; RV32V-NEXT:    vxor.vv v14, v14, v10
 ; RV32V-NEXT:    vlse64.v v10, (s9), zero
@@ -3632,7 +3631,7 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vv(<vscale x 1 x i64> %va, <vscale x 1
 ; RV32V-NEXT:    add t5, sp, t5
 ; RV32V-NEXT:    addi t5, t5, 288
 ; RV32V-NEXT:    vs1r.v v10, (t5) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi t5, sp, 32
+; RV32V-NEXT:    addi t5, sp, 40
 ; RV32V-NEXT:    vmul.vv v15, v8, v15
 ; RV32V-NEXT:    vxor.vv v15, v14, v15
 ; RV32V-NEXT:    vlse64.v v10, (s8), zero
@@ -3642,7 +3641,7 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vv(<vscale x 1 x i64> %va, <vscale x 1
 ; RV32V-NEXT:    add s8, sp, s8
 ; RV32V-NEXT:    addi s8, s8, 288
 ; RV32V-NEXT:    vs1r.v v10, (s8) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi s8, sp, 24
+; RV32V-NEXT:    addi s8, sp, 32
 ; RV32V-NEXT:    vmul.vv v16, v8, v16
 ; RV32V-NEXT:    vxor.vv v16, v15, v16
 ; RV32V-NEXT:    vlse64.v v10, (s7), zero
@@ -3651,7 +3650,7 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vv(<vscale x 1 x i64> %va, <vscale x 1
 ; RV32V-NEXT:    add s7, sp, s7
 ; RV32V-NEXT:    addi s7, s7, 288
 ; RV32V-NEXT:    vs1r.v v10, (s7) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi s7, sp, 16
+; RV32V-NEXT:    addi s7, sp, 24
 ; RV32V-NEXT:    vmul.vv v17, v8, v17
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vmul.vv v19, v8, v19
@@ -3676,7 +3675,7 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vv(<vscale x 1 x i64> %va, <vscale x 1
 ; RV32V-NEXT:    vmul.vv v1, v8, v1
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v16, v16, v17
-; RV32V-NEXT:    addi s9, sp, 224
+; RV32V-NEXT:    addi s9, sp, 232
 ; RV32V-NEXT:    vlse64.v v11, (s9), zero
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    vlse64.v v10, (s6), zero
@@ -4275,159 +4274,158 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vx(<vscale x 1 x i64> %va, i64 %b) noun
 ; RV32V-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32V-NEXT:    vlse64.v v9, (s4), zero
 ; RV32V-NEXT:    lui s4, 32768
-; RV32V-NEXT:    sw s2, 272(sp)
+; RV32V-NEXT:    sw s2, 16(sp)
 ; RV32V-NEXT:    lui a7, 524288
-; RV32V-NEXT:    sw zero, 276(sp)
+; RV32V-NEXT:    sw zero, 20(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw s11, 276(sp)
 ; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw s11, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw s6, 260(sp)
+; RV32V-NEXT:    sw s6, 268(sp)
 ; RV32V-NEXT:    lui s6, 65536
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw s8, 252(sp)
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw s8, 260(sp)
 ; RV32V-NEXT:    lui s8, 131072
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw s10, 244(sp)
+; RV32V-NEXT:    sw zero, 248(sp)
+; RV32V-NEXT:    sw s10, 252(sp)
 ; RV32V-NEXT:    lui a1, 262144
-; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    sw zero, 240(sp)
 ; RV32V-NEXT:    li s2, 16
-; RV32V-NEXT:    sw s2, 236(sp)
+; RV32V-NEXT:    sw s2, 244(sp)
 ; RV32V-NEXT:    li s10, 16
-; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw zero, 232(sp)
 ; RV32V-NEXT:    li s2, 32
-; RV32V-NEXT:    sw s2, 228(sp)
+; RV32V-NEXT:    sw s2, 236(sp)
+; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw ra, 228(sp)
 ; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw ra, 220(sp)
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s9, 212(sp)
+; RV32V-NEXT:    sw s9, 220(sp)
 ; RV32V-NEXT:    li s2, 128
+; RV32V-NEXT:    sw zero, 208(sp)
+; RV32V-NEXT:    sw s7, 212(sp)
 ; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s7, 204(sp)
+; RV32V-NEXT:    sw s5, 204(sp)
 ; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s5, 196(sp)
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s3, 188(sp)
+; RV32V-NEXT:    sw s3, 196(sp)
 ; RV32V-NEXT:    slli s11, s11, 11
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw s11, 188(sp)
 ; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw s11, 180(sp)
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw s1, 172(sp)
+; RV32V-NEXT:    sw s1, 180(sp)
 ; RV32V-NEXT:    lui s3, 1
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s0, 164(sp)
+; RV32V-NEXT:    sw zero, 168(sp)
+; RV32V-NEXT:    sw s0, 172(sp)
 ; RV32V-NEXT:    lui s1, 2
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw t6, 156(sp)
+; RV32V-NEXT:    sw zero, 160(sp)
+; RV32V-NEXT:    sw t6, 164(sp)
 ; RV32V-NEXT:    lui s0, 4
+; RV32V-NEXT:    sw zero, 152(sp)
+; RV32V-NEXT:    sw t5, 156(sp)
 ; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw t5, 148(sp)
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw t4, 140(sp)
+; RV32V-NEXT:    sw t4, 148(sp)
 ; RV32V-NEXT:    lui t6, 16
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw t3, 132(sp)
+; RV32V-NEXT:    sw zero, 136(sp)
+; RV32V-NEXT:    sw t3, 140(sp)
 ; RV32V-NEXT:    lui t4, 32
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t2, 124(sp)
+; RV32V-NEXT:    sw zero, 128(sp)
+; RV32V-NEXT:    sw t2, 132(sp)
 ; RV32V-NEXT:    lui t3, 64
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t1, 116(sp)
+; RV32V-NEXT:    sw zero, 120(sp)
+; RV32V-NEXT:    sw t1, 124(sp)
 ; RV32V-NEXT:    lui t2, 128
+; RV32V-NEXT:    sw zero, 112(sp)
+; RV32V-NEXT:    sw t0, 116(sp)
 ; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t0, 108(sp)
+; RV32V-NEXT:    sw a6, 108(sp)
 ; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw a6, 100(sp)
+; RV32V-NEXT:    sw a5, 100(sp)
 ; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw a5, 92(sp)
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw a4, 84(sp)
+; RV32V-NEXT:    sw a4, 92(sp)
 ; RV32V-NEXT:    lui t1, 2048
+; RV32V-NEXT:    sw zero, 80(sp)
+; RV32V-NEXT:    sw a3, 84(sp)
 ; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a3, 76(sp)
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a2, 68(sp)
+; RV32V-NEXT:    sw a2, 76(sp)
 ; RV32V-NEXT:    lui a4, 8192
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a0, 60(sp)
+; RV32V-NEXT:    sw zero, 64(sp)
+; RV32V-NEXT:    sw a0, 68(sp)
 ; RV32V-NEXT:    lui a2, 16384
+; RV32V-NEXT:    sw zero, 56(sp)
+; RV32V-NEXT:    sw s4, 60(sp)
 ; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw s4, 52(sp)
+; RV32V-NEXT:    sw s6, 52(sp)
 ; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw s6, 44(sp)
+; RV32V-NEXT:    sw s8, 44(sp)
 ; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw s8, 36(sp)
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    sw a1, 28(sp)
+; RV32V-NEXT:    sw a1, 36(sp)
 ; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw a7, 20(sp)
-; RV32V-NEXT:    addi a1, sp, 272
+; RV32V-NEXT:    sw zero, 24(sp)
+; RV32V-NEXT:    sw a7, 28(sp)
+; RV32V-NEXT:    addi a1, sp, 16
 ; RV32V-NEXT:    vlse64.v v11, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 264
+; RV32V-NEXT:    addi a1, sp, 272
 ; RV32V-NEXT:    vlse64.v v10, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 256
+; RV32V-NEXT:    addi a1, sp, 264
 ; RV32V-NEXT:    vlse64.v v13, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 248
+; RV32V-NEXT:    addi a1, sp, 256
 ; RV32V-NEXT:    vlse64.v v14, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 240
+; RV32V-NEXT:    addi a1, sp, 248
 ; RV32V-NEXT:    vlse64.v v15, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 232
+; RV32V-NEXT:    addi a1, sp, 240
 ; RV32V-NEXT:    vlse64.v v16, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 224
+; RV32V-NEXT:    addi a1, sp, 232
 ; RV32V-NEXT:    vlse64.v v17, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 216
+; RV32V-NEXT:    addi a1, sp, 224
 ; RV32V-NEXT:    vlse64.v v18, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 208
+; RV32V-NEXT:    addi a1, sp, 216
 ; RV32V-NEXT:    vlse64.v v19, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 200
+; RV32V-NEXT:    addi a1, sp, 208
 ; RV32V-NEXT:    vlse64.v v20, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 192
+; RV32V-NEXT:    addi a1, sp, 200
 ; RV32V-NEXT:    vlse64.v v21, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 184
+; RV32V-NEXT:    addi a1, sp, 192
 ; RV32V-NEXT:    vlse64.v v22, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 176
+; RV32V-NEXT:    addi a1, sp, 184
 ; RV32V-NEXT:    vlse64.v v23, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 168
+; RV32V-NEXT:    addi a1, sp, 176
 ; RV32V-NEXT:    vlse64.v v24, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 160
+; RV32V-NEXT:    addi a1, sp, 168
 ; RV32V-NEXT:    vlse64.v v25, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 152
+; RV32V-NEXT:    addi a1, sp, 160
 ; RV32V-NEXT:    vlse64.v v26, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 144
+; RV32V-NEXT:    addi a1, sp, 152
 ; RV32V-NEXT:    vlse64.v v27, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 136
+; RV32V-NEXT:    addi a1, sp, 144
 ; RV32V-NEXT:    vlse64.v v28, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 128
+; RV32V-NEXT:    addi a1, sp, 136
 ; RV32V-NEXT:    vlse64.v v29, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 120
+; RV32V-NEXT:    addi a1, sp, 128
 ; RV32V-NEXT:    vlse64.v v30, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 112
+; RV32V-NEXT:    addi a1, sp, 120
 ; RV32V-NEXT:    vlse64.v v31, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 104
+; RV32V-NEXT:    addi a1, sp, 112
 ; RV32V-NEXT:    vlse64.v v7, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 96
+; RV32V-NEXT:    addi a1, sp, 104
 ; RV32V-NEXT:    vlse64.v v6, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 88
+; RV32V-NEXT:    addi a1, sp, 96
 ; RV32V-NEXT:    vlse64.v v5, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 80
+; RV32V-NEXT:    addi a1, sp, 88
 ; RV32V-NEXT:    vlse64.v v3, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 72
+; RV32V-NEXT:    addi a1, sp, 80
 ; RV32V-NEXT:    vlse64.v v12, (a1), zero
 ; RV32V-NEXT:    csrr a1, vlenb
 ; RV32V-NEXT:    slli a1, a1, 2
 ; RV32V-NEXT:    add a1, sp, a1
 ; RV32V-NEXT:    addi a1, a1, 288
 ; RV32V-NEXT:    vs1r.v v12, (a1) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi a1, sp, 64
+; RV32V-NEXT:    addi a1, sp, 72
 ; RV32V-NEXT:    vlse64.v v4, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 56
+; RV32V-NEXT:    addi a1, sp, 64
 ; RV32V-NEXT:    vlse64.v v2, (a1), zero
-; RV32V-NEXT:    addi ra, sp, 48
+; RV32V-NEXT:    addi ra, sp, 56
 ; RV32V-NEXT:    vand.vi v1, v9, 2
 ; RV32V-NEXT:    vand.vi v0, v9, 1
 ; RV32V-NEXT:    vmul.vv v1, v8, v1
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
-; RV32V-NEXT:    vxor.vi v0, v0, 0
 ; RV32V-NEXT:    vxor.vv v1, v0, v1
 ; RV32V-NEXT:    vand.vi v0, v9, 4
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
@@ -4436,21 +4434,21 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vx(<vscale x 1 x i64> %va, i64 %b) noun
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v1, v1, v0
 ; RV32V-NEXT:    vand.vx v0, v9, s10
-; RV32V-NEXT:    addi s10, sp, 40
+; RV32V-NEXT:    addi s10, sp, 48
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v1, v1, v0
 ; RV32V-NEXT:    li a1, 32
 ; RV32V-NEXT:    vand.vx v0, v9, a1
-; RV32V-NEXT:    addi s9, sp, 32
+; RV32V-NEXT:    addi s9, sp, 40
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v1, v1, v0
 ; RV32V-NEXT:    li a1, 64
 ; RV32V-NEXT:    vand.vx v0, v9, a1
-; RV32V-NEXT:    addi s7, sp, 24
+; RV32V-NEXT:    addi s7, sp, 32
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v1, v1, v0
 ; RV32V-NEXT:    vand.vx v0, v9, s2
-; RV32V-NEXT:    addi s5, sp, 16
+; RV32V-NEXT:    addi s5, sp, 24
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v1, v1, v0
 ; RV32V-NEXT:    li a1, 256
@@ -5004,7 +5002,7 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vv(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32V-NEXT:    li a0, 16
 ; RV32V-NEXT:    li s6, 32
 ; RV32V-NEXT:    li s5, 64
-; RV32V-NEXT:    li s4, 128
+; RV32V-NEXT:    li s3, 128
 ; RV32V-NEXT:    li s1, 256
 ; RV32V-NEXT:    li s0, 512
 ; RV32V-NEXT:    li t5, 1024
@@ -5013,7 +5011,7 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vv(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32V-NEXT:    lui s10, 4
 ; RV32V-NEXT:    lui s11, 8
 ; RV32V-NEXT:    lui s9, 16
-; RV32V-NEXT:    lui s3, 32
+; RV32V-NEXT:    lui s4, 32
 ; RV32V-NEXT:    lui t6, 64
 ; RV32V-NEXT:    lui t4, 128
 ; RV32V-NEXT:    lui t3, 256
@@ -5024,82 +5022,81 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vv(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32V-NEXT:    lui a6, 8192
 ; RV32V-NEXT:    lui a5, 16384
 ; RV32V-NEXT:    lui a4, 32768
-; RV32V-NEXT:    sw a1, 272(sp)
-; RV32V-NEXT:    sw zero, 276(sp)
+; RV32V-NEXT:    sw a1, 16(sp)
+; RV32V-NEXT:    sw zero, 20(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw s2, 276(sp)
 ; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw s2, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw a3, 260(sp)
+; RV32V-NEXT:    sw a3, 268(sp)
 ; RV32V-NEXT:    lui a3, 65536
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw a2, 252(sp)
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw a2, 260(sp)
 ; RV32V-NEXT:    lui a2, 131072
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw s7, 244(sp)
+; RV32V-NEXT:    sw zero, 248(sp)
+; RV32V-NEXT:    sw s7, 252(sp)
 ; RV32V-NEXT:    vsetvli s7, zero, e64, m2, ta, ma
-; RV32V-NEXT:    vand.vi v28, v10, 2
+; RV32V-NEXT:    vand.vi v24, v10, 2
 ; RV32V-NEXT:    vand.vi v20, v10, 1
-; RV32V-NEXT:    vand.vi v30, v10, 4
+; RV32V-NEXT:    vand.vi v26, v10, 4
 ; RV32V-NEXT:    vand.vi v14, v10, 8
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw a0, 236(sp)
+; RV32V-NEXT:    sw zero, 240(sp)
+; RV32V-NEXT:    sw a0, 244(sp)
 ; RV32V-NEXT:    vand.vx v12, v10, a0
-; RV32V-NEXT:    addi s7, sp, 272
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw s6, 228(sp)
+; RV32V-NEXT:    addi s7, sp, 16
+; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    sw s6, 236(sp)
 ; RV32V-NEXT:    vand.vx v16, v10, s6
-; RV32V-NEXT:    addi s6, sp, 264
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s5, 220(sp)
+; RV32V-NEXT:    addi s6, sp, 272
+; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw s5, 228(sp)
 ; RV32V-NEXT:    vand.vx v18, v10, s5
-; RV32V-NEXT:    addi s5, sp, 256
+; RV32V-NEXT:    addi s5, sp, 264
+; RV32V-NEXT:    sw zero, 216(sp)
+; RV32V-NEXT:    sw s3, 220(sp)
+; RV32V-NEXT:    vand.vx v0, v10, s3
+; RV32V-NEXT:    addi s3, sp, 256
 ; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s4, 212(sp)
-; RV32V-NEXT:    vand.vx v0, v10, s4
-; RV32V-NEXT:    addi s4, sp, 248
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s1, 204(sp)
+; RV32V-NEXT:    sw s1, 212(sp)
 ; RV32V-NEXT:    vand.vx v6, v10, s1
-; RV32V-NEXT:    addi s1, sp, 240
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s0, 196(sp)
+; RV32V-NEXT:    addi s1, sp, 248
+; RV32V-NEXT:    sw zero, 200(sp)
+; RV32V-NEXT:    sw s0, 204(sp)
 ; RV32V-NEXT:    vand.vx v4, v10, s0
-; RV32V-NEXT:    addi s0, sp, 232
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw t5, 188(sp)
+; RV32V-NEXT:    addi s0, sp, 240
+; RV32V-NEXT:    sw zero, 192(sp)
+; RV32V-NEXT:    sw t5, 196(sp)
 ; RV32V-NEXT:    vand.vx v2, v10, t5
 ; RV32V-NEXT:    slli s2, s2, 11
-; RV32V-NEXT:    vand.vx v24, v10, ra
+; RV32V-NEXT:    vand.vx v28, v10, ra
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw s2, 188(sp)
 ; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw s2, 180(sp)
+; RV32V-NEXT:    sw ra, 180(sp)
+; RV32V-NEXT:    addi t5, sp, 224
+; RV32V-NEXT:    vand.vx v30, v10, s8
 ; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw ra, 172(sp)
-; RV32V-NEXT:    addi t5, sp, 216
-; RV32V-NEXT:    vand.vx v26, v10, s8
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s8, 164(sp)
-; RV32V-NEXT:    addi s8, sp, 208
+; RV32V-NEXT:    sw s8, 172(sp)
+; RV32V-NEXT:    addi s8, sp, 216
 ; RV32V-NEXT:    vand.vx v22, v10, s10
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw s10, 156(sp)
-; RV32V-NEXT:    addi s10, sp, 200
-; RV32V-NEXT:    vmul.vv v28, v8, v28
+; RV32V-NEXT:    sw zero, 160(sp)
+; RV32V-NEXT:    sw s10, 164(sp)
+; RV32V-NEXT:    addi s10, sp, 208
+; RV32V-NEXT:    vmul.vv v24, v8, v24
 ; RV32V-NEXT:    vmul.vv v20, v8, v20
-; RV32V-NEXT:    vxor.vi v20, v20, 0
-; RV32V-NEXT:    vxor.vv v20, v20, v28
-; RV32V-NEXT:    vand.vx v28, v10, s11
+; RV32V-NEXT:    vxor.vv v20, v20, v24
+; RV32V-NEXT:    vand.vx v24, v10, s11
+; RV32V-NEXT:    sw zero, 152(sp)
+; RV32V-NEXT:    sw s11, 156(sp)
+; RV32V-NEXT:    addi s11, sp, 200
+; RV32V-NEXT:    vmul.vv v26, v8, v26
+; RV32V-NEXT:    vxor.vv v20, v20, v26
+; RV32V-NEXT:    vand.vx v26, v10, s9
 ; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw s11, 148(sp)
-; RV32V-NEXT:    addi s11, sp, 192
-; RV32V-NEXT:    vmul.vv v30, v8, v30
-; RV32V-NEXT:    vxor.vv v20, v20, v30
-; RV32V-NEXT:    vand.vx v30, v10, s9
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw s9, 140(sp)
-; RV32V-NEXT:    addi s9, sp, 184
+; RV32V-NEXT:    sw s9, 148(sp)
+; RV32V-NEXT:    addi s9, sp, 192
 ; RV32V-NEXT:    vmul.vv v14, v8, v14
 ; RV32V-NEXT:    vxor.vv v14, v20, v14
-; RV32V-NEXT:    vand.vx v20, v10, s3
+; RV32V-NEXT:    vand.vx v20, v10, s4
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv ra, a0
@@ -5108,39 +5105,39 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vv(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
 ; RV32V-NEXT:    vs2r.v v20, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw s3, 132(sp)
-; RV32V-NEXT:    addi s3, sp, 176
+; RV32V-NEXT:    sw zero, 136(sp)
+; RV32V-NEXT:    sw s4, 140(sp)
+; RV32V-NEXT:    addi s4, sp, 184
 ; RV32V-NEXT:    vmul.vv v12, v8, v12
 ; RV32V-NEXT:    vxor.vv v12, v14, v12
 ; RV32V-NEXT:    vand.vx v14, v10, t6
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t6, 124(sp)
-; RV32V-NEXT:    addi t6, sp, 168
+; RV32V-NEXT:    sw zero, 128(sp)
+; RV32V-NEXT:    sw t6, 132(sp)
+; RV32V-NEXT:    addi t6, sp, 176
 ; RV32V-NEXT:    vmul.vv v16, v8, v16
 ; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    vand.vx v16, v10, t4
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t4, 116(sp)
-; RV32V-NEXT:    addi t4, sp, 160
+; RV32V-NEXT:    sw zero, 120(sp)
+; RV32V-NEXT:    sw t4, 124(sp)
+; RV32V-NEXT:    addi t4, sp, 168
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v18, v12, v18
 ; RV32V-NEXT:    vand.vx v12, v10, t3
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t3, 108(sp)
-; RV32V-NEXT:    addi t3, sp, 152
+; RV32V-NEXT:    sw zero, 112(sp)
+; RV32V-NEXT:    sw t3, 116(sp)
+; RV32V-NEXT:    addi t3, sp, 160
 ; RV32V-NEXT:    vmul.vv v20, v8, v0
 ; RV32V-NEXT:    vxor.vv v18, v18, v20
 ; RV32V-NEXT:    vand.vx v20, v10, t2
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw t2, 100(sp)
-; RV32V-NEXT:    addi t2, sp, 144
+; RV32V-NEXT:    sw zero, 104(sp)
+; RV32V-NEXT:    sw t2, 108(sp)
+; RV32V-NEXT:    addi t2, sp, 152
 ; RV32V-NEXT:    vmul.vv v6, v8, v6
 ; RV32V-NEXT:    vxor.vv v18, v18, v6
 ; RV32V-NEXT:    vand.vx v6, v10, t1
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw t1, 92(sp)
-; RV32V-NEXT:    addi t1, sp, 136
+; RV32V-NEXT:    sw zero, 96(sp)
+; RV32V-NEXT:    sw t1, 100(sp)
+; RV32V-NEXT:    addi t1, sp, 144
 ; RV32V-NEXT:    vmul.vv v4, v8, v4
 ; RV32V-NEXT:    vxor.vv v18, v18, v4
 ; RV32V-NEXT:    vand.vx v4, v10, t0
@@ -5154,55 +5151,55 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vv(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
 ; RV32V-NEXT:    vs2r.v v4, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw t0, 84(sp)
-; RV32V-NEXT:    addi t0, sp, 128
+; RV32V-NEXT:    sw zero, 88(sp)
+; RV32V-NEXT:    sw t0, 92(sp)
+; RV32V-NEXT:    addi t0, sp, 136
 ; RV32V-NEXT:    vmul.vv v2, v8, v2
 ; RV32V-NEXT:    vxor.vv v18, v18, v2
 ; RV32V-NEXT:    vand.vx v2, v10, s2
-; RV32V-NEXT:    addi ra, sp, 120
+; RV32V-NEXT:    addi ra, sp, 128
 ; RV32V-NEXT:    vmul.vv v2, v8, v2
 ; RV32V-NEXT:    vxor.vv v18, v18, v2
 ; RV32V-NEXT:    vand.vx v2, v10, a7
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a7, 76(sp)
-; RV32V-NEXT:    addi a7, sp, 112
-; RV32V-NEXT:    vmul.vv v24, v8, v24
-; RV32V-NEXT:    vxor.vv v18, v18, v24
+; RV32V-NEXT:    sw zero, 80(sp)
+; RV32V-NEXT:    sw a7, 84(sp)
+; RV32V-NEXT:    addi a7, sp, 120
+; RV32V-NEXT:    vmul.vv v28, v8, v28
+; RV32V-NEXT:    vxor.vv v18, v18, v28
 ; RV32V-NEXT:    vand.vx v4, v10, a6
+; RV32V-NEXT:    sw zero, 72(sp)
+; RV32V-NEXT:    sw a6, 76(sp)
+; RV32V-NEXT:    addi a6, sp, 112
+; RV32V-NEXT:    vmul.vv v30, v8, v30
+; RV32V-NEXT:    vxor.vv v18, v18, v30
+; RV32V-NEXT:    vand.vx v30, v10, a5
 ; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a6, 68(sp)
-; RV32V-NEXT:    addi a6, sp, 104
-; RV32V-NEXT:    vmul.vv v26, v8, v26
-; RV32V-NEXT:    vxor.vv v18, v18, v26
-; RV32V-NEXT:    vand.vx v26, v10, a5
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a5, 60(sp)
-; RV32V-NEXT:    addi a5, sp, 96
+; RV32V-NEXT:    sw a5, 68(sp)
+; RV32V-NEXT:    addi a5, sp, 104
 ; RV32V-NEXT:    vmul.vv v22, v8, v22
 ; RV32V-NEXT:    vxor.vv v18, v18, v22
-; RV32V-NEXT:    vand.vx v24, v10, a4
+; RV32V-NEXT:    vand.vx v28, v10, a4
+; RV32V-NEXT:    sw zero, 56(sp)
+; RV32V-NEXT:    sw a4, 60(sp)
+; RV32V-NEXT:    addi a4, sp, 96
+; RV32V-NEXT:    vmul.vv v24, v8, v24
+; RV32V-NEXT:    vxor.vv v18, v18, v24
+; RV32V-NEXT:    vand.vx v24, v10, a3
 ; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a4, 52(sp)
-; RV32V-NEXT:    addi a4, sp, 88
-; RV32V-NEXT:    vmul.vv v28, v8, v28
-; RV32V-NEXT:    vxor.vv v18, v18, v28
-; RV32V-NEXT:    vand.vx v28, v10, a3
+; RV32V-NEXT:    sw a3, 52(sp)
+; RV32V-NEXT:    addi a3, sp, 88
+; RV32V-NEXT:    vmul.vv v26, v8, v26
+; RV32V-NEXT:    vxor.vv v18, v18, v26
+; RV32V-NEXT:    vand.vx v26, v10, a2
 ; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw a3, 44(sp)
-; RV32V-NEXT:    addi a3, sp, 80
-; RV32V-NEXT:    vmul.vv v30, v8, v30
-; RV32V-NEXT:    vxor.vv v18, v18, v30
-; RV32V-NEXT:    vand.vx v30, v10, a2
+; RV32V-NEXT:    sw a2, 44(sp)
+; RV32V-NEXT:    addi a2, sp, 80
 ; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw a2, 36(sp)
-; RV32V-NEXT:    addi a2, sp, 72
-; RV32V-NEXT:    sw zero, 24(sp)
 ; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    sw a0, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw a1, 20(sp)
-; RV32V-NEXT:    addi a1, sp, 64
+; RV32V-NEXT:    sw a0, 36(sp)
+; RV32V-NEXT:    sw zero, 24(sp)
+; RV32V-NEXT:    sw a1, 28(sp)
+; RV32V-NEXT:    addi a1, sp, 72
 ; RV32V-NEXT:    sw a6, 4(sp) # 4-byte Folded Spill
 ; RV32V-NEXT:    csrr a6, vlenb
 ; RV32V-NEXT:    slli a6, a6, 3
@@ -5223,7 +5220,7 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vv(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32V-NEXT:    add a6, sp, a6
 ; RV32V-NEXT:    addi a6, a6, 288
 ; RV32V-NEXT:    vs2r.v v18, (a6) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi s7, sp, 56
+; RV32V-NEXT:    addi s7, sp, 64
 ; RV32V-NEXT:    vmul.vv v14, v8, v14
 ; RV32V-NEXT:    vxor.vv v14, v0, v14
 ; RV32V-NEXT:    vlse64.v v18, (s6), zero
@@ -5235,7 +5232,7 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vv(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32V-NEXT:    add a6, sp, a6
 ; RV32V-NEXT:    addi a6, a6, 288
 ; RV32V-NEXT:    vs2r.v v18, (a6) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi s2, sp, 48
+; RV32V-NEXT:    addi s2, sp, 56
 ; RV32V-NEXT:    vmul.vv v16, v8, v16
 ; RV32V-NEXT:    vxor.vv v14, v14, v16
 ; RV32V-NEXT:    vlse64.v v16, (s5), zero
@@ -5247,16 +5244,16 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vv(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32V-NEXT:    add a6, sp, a6
 ; RV32V-NEXT:    addi a6, a6, 288
 ; RV32V-NEXT:    vs2r.v v16, (a6) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi s5, sp, 40
+; RV32V-NEXT:    addi s5, sp, 48
 ; RV32V-NEXT:    vmul.vv v12, v8, v12
 ; RV32V-NEXT:    vxor.vv v12, v14, v12
-; RV32V-NEXT:    vlse64.v v14, (s4), zero
+; RV32V-NEXT:    vlse64.v v14, (s3), zero
 ; RV32V-NEXT:    csrr a6, vlenb
 ; RV32V-NEXT:    slli a6, a6, 5
 ; RV32V-NEXT:    add a6, sp, a6
 ; RV32V-NEXT:    addi a6, a6, 288
 ; RV32V-NEXT:    vs2r.v v14, (a6) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi s4, sp, 32
+; RV32V-NEXT:    addi s3, sp, 40
 ; RV32V-NEXT:    vmul.vv v20, v8, v20
 ; RV32V-NEXT:    vxor.vv v20, v12, v20
 ; RV32V-NEXT:    vlse64.v v12, (s1), zero
@@ -5272,7 +5269,7 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vv(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32V-NEXT:    add a6, sp, a6
 ; RV32V-NEXT:    addi a6, a6, 288
 ; RV32V-NEXT:    vs2r.v v12, (a6) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi s1, sp, 24
+; RV32V-NEXT:    addi s1, sp, 32
 ; RV32V-NEXT:    vmul.vv v6, v8, v6
 ; RV32V-NEXT:    vxor.vv v20, v20, v6
 ; RV32V-NEXT:    vlse64.v v12, (s0), zero
@@ -5286,7 +5283,7 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vv(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32V-NEXT:    add a6, sp, a6
 ; RV32V-NEXT:    addi a6, a6, 288
 ; RV32V-NEXT:    vs2r.v v12, (a6) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi s0, sp, 16
+; RV32V-NEXT:    addi s0, sp, 24
 ; RV32V-NEXT:    csrr s6, vlenb
 ; RV32V-NEXT:    slli s6, s6, 1
 ; RV32V-NEXT:    mv a6, s6
@@ -5301,48 +5298,48 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vv(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32V-NEXT:    vmul.vv v6, v8, v12
 ; RV32V-NEXT:    vmul.vv v2, v8, v2
 ; RV32V-NEXT:    vmul.vv v4, v8, v4
-; RV32V-NEXT:    vmul.vv v26, v8, v26
-; RV32V-NEXT:    vmul.vv v24, v8, v24
-; RV32V-NEXT:    vmul.vv v28, v8, v28
 ; RV32V-NEXT:    vmul.vv v30, v8, v30
+; RV32V-NEXT:    vmul.vv v28, v8, v28
+; RV32V-NEXT:    vmul.vv v24, v8, v24
+; RV32V-NEXT:    vmul.vv v26, v8, v26
 ; RV32V-NEXT:    vxor.vv v20, v20, v6
-; RV32V-NEXT:    addi s6, sp, 224
+; RV32V-NEXT:    addi s6, sp, 232
 ; RV32V-NEXT:    vlse64.v v0, (s6), zero
 ; RV32V-NEXT:    vxor.vv v20, v20, v2
 ; RV32V-NEXT:    vlse64.v v6, (t5), zero
 ; RV32V-NEXT:    vxor.vv v20, v20, v4
 ; RV32V-NEXT:    vlse64.v v22, (s8), zero
-; RV32V-NEXT:    vxor.vv v20, v20, v26
+; RV32V-NEXT:    vxor.vv v20, v20, v30
 ; RV32V-NEXT:    vlse64.v v18, (s10), zero
-; RV32V-NEXT:    vxor.vv v20, v20, v24
-; RV32V-NEXT:    vlse64.v v16, (s11), zero
 ; RV32V-NEXT:    vxor.vv v20, v20, v28
+; RV32V-NEXT:    vlse64.v v16, (s11), zero
+; RV32V-NEXT:    vxor.vv v20, v20, v24
 ; RV32V-NEXT:    vlse64.v v14, (s9), zero
-; RV32V-NEXT:    vxor.vv v2, v20, v30
-; RV32V-NEXT:    vlse64.v v12, (s3), zero
+; RV32V-NEXT:    vxor.vv v2, v20, v26
+; RV32V-NEXT:    vlse64.v v12, (s4), zero
 ; RV32V-NEXT:    csrr t5, vlenb
 ; RV32V-NEXT:    slli t5, t5, 3
-; RV32V-NEXT:    mv s3, t5
+; RV32V-NEXT:    mv s4, t5
 ; RV32V-NEXT:    slli t5, t5, 2
-; RV32V-NEXT:    add t5, t5, s3
+; RV32V-NEXT:    add t5, t5, s4
 ; RV32V-NEXT:    add t5, sp, t5
 ; RV32V-NEXT:    addi t5, t5, 288
 ; RV32V-NEXT:    vl2r.v v20, (t5) # vscale x 16-byte Folded Reload
 ; RV32V-NEXT:    vand.vv v26, v10, v20
 ; RV32V-NEXT:    csrr t5, vlenb
 ; RV32V-NEXT:    slli t5, t5, 2
-; RV32V-NEXT:    mv s3, t5
+; RV32V-NEXT:    mv s4, t5
 ; RV32V-NEXT:    slli t5, t5, 3
-; RV32V-NEXT:    add t5, t5, s3
+; RV32V-NEXT:    add t5, t5, s4
 ; RV32V-NEXT:    add t5, sp, t5
 ; RV32V-NEXT:    addi t5, t5, 288
 ; RV32V-NEXT:    vl2r.v v20, (t5) # vscale x 16-byte Folded Reload
 ; RV32V-NEXT:    vand.vv v4, v10, v20
 ; RV32V-NEXT:    csrr t5, vlenb
 ; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    mv s3, t5
+; RV32V-NEXT:    mv s4, t5
 ; RV32V-NEXT:    slli t5, t5, 4
-; RV32V-NEXT:    add t5, t5, s3
+; RV32V-NEXT:    add t5, t5, s4
 ; RV32V-NEXT:    add t5, sp, t5
 ; RV32V-NEXT:    addi t5, t5, 288
 ; RV32V-NEXT:    vl2r.v v20, (t5) # vscale x 16-byte Folded Reload
@@ -5355,24 +5352,24 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vv(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32V-NEXT:    vand.vv v20, v10, v20
 ; RV32V-NEXT:    csrr t5, vlenb
 ; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    mv s3, t5
+; RV32V-NEXT:    mv s4, t5
 ; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add s3, s3, t5
+; RV32V-NEXT:    add s4, s4, t5
 ; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add s3, s3, t5
+; RV32V-NEXT:    add s4, s4, t5
 ; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add t5, t5, s3
+; RV32V-NEXT:    add t5, t5, s4
 ; RV32V-NEXT:    add t5, sp, t5
 ; RV32V-NEXT:    addi t5, t5, 288
 ; RV32V-NEXT:    vl2r.v v24, (t5) # vscale x 16-byte Folded Reload
 ; RV32V-NEXT:    vand.vv v28, v10, v24
 ; RV32V-NEXT:    csrr t5, vlenb
 ; RV32V-NEXT:    slli t5, t5, 2
-; RV32V-NEXT:    mv s3, t5
+; RV32V-NEXT:    mv s4, t5
 ; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add s3, s3, t5
+; RV32V-NEXT:    add s4, s4, t5
 ; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add t5, t5, s3
+; RV32V-NEXT:    add t5, t5, s4
 ; RV32V-NEXT:    add t5, sp, t5
 ; RV32V-NEXT:    addi t5, t5, 288
 ; RV32V-NEXT:    vl2r.v v24, (t5) # vscale x 16-byte Folded Reload
@@ -5389,31 +5386,31 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vv(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32V-NEXT:    vand.vv v16, v10, v16
 ; RV32V-NEXT:    csrr t5, vlenb
 ; RV32V-NEXT:    slli t5, t5, 2
-; RV32V-NEXT:    mv s3, t5
+; RV32V-NEXT:    mv s4, t5
 ; RV32V-NEXT:    slli t5, t5, 2
-; RV32V-NEXT:    add t5, t5, s3
+; RV32V-NEXT:    add t5, t5, s4
 ; RV32V-NEXT:    add t5, sp, t5
 ; RV32V-NEXT:    addi t5, t5, 288
 ; RV32V-NEXT:    vs2r.v v16, (t5) # vscale x 16-byte Folded Spill
 ; RV32V-NEXT:    vand.vv v14, v10, v14
 ; RV32V-NEXT:    csrr t5, vlenb
 ; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    mv s3, t5
+; RV32V-NEXT:    mv s4, t5
 ; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add s3, s3, t5
+; RV32V-NEXT:    add s4, s4, t5
 ; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add s3, s3, t5
+; RV32V-NEXT:    add s4, s4, t5
 ; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add t5, t5, s3
+; RV32V-NEXT:    add t5, t5, s4
 ; RV32V-NEXT:    add t5, sp, t5
 ; RV32V-NEXT:    addi t5, t5, 288
 ; RV32V-NEXT:    vs2r.v v14, (t5) # vscale x 16-byte Folded Spill
 ; RV32V-NEXT:    vand.vv v12, v10, v12
 ; RV32V-NEXT:    csrr t5, vlenb
 ; RV32V-NEXT:    slli t5, t5, 3
-; RV32V-NEXT:    mv s3, t5
+; RV32V-NEXT:    mv s4, t5
 ; RV32V-NEXT:    slli t5, t5, 2
-; RV32V-NEXT:    add t5, t5, s3
+; RV32V-NEXT:    add t5, t5, s4
 ; RV32V-NEXT:    add t5, sp, t5
 ; RV32V-NEXT:    addi t5, t5, 288
 ; RV32V-NEXT:    vs2r.v v12, (t5) # vscale x 16-byte Folded Spill
@@ -5570,7 +5567,7 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vv(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32V-NEXT:    addi a1, a1, 288
 ; RV32V-NEXT:    vs2r.v v12, (a1) # vscale x 16-byte Folded Spill
 ; RV32V-NEXT:    vlse64.v v14, (s5), zero
-; RV32V-NEXT:    vlse64.v v16, (s4), zero
+; RV32V-NEXT:    vlse64.v v16, (s3), zero
 ; RV32V-NEXT:    vlse64.v v18, (s1), zero
 ; RV32V-NEXT:    vlse64.v v12, (s0), zero
 ; RV32V-NEXT:    vand.vv v14, v10, v14
@@ -6167,97 +6164,97 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vx(<vscale x 2 x i64> %va, i64 %b) noun
 ; RV32V-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32V-NEXT:    vlse64.v v10, (a0), zero
 ; RV32V-NEXT:    lui a2, 32768
-; RV32V-NEXT:    sw s5, 272(sp)
+; RV32V-NEXT:    sw s5, 16(sp)
 ; RV32V-NEXT:    lui t1, 524288
-; RV32V-NEXT:    sw zero, 276(sp)
+; RV32V-NEXT:    sw zero, 20(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw s11, 276(sp)
 ; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw s11, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw t2, 260(sp)
+; RV32V-NEXT:    sw t2, 268(sp)
 ; RV32V-NEXT:    lui a0, 65536
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw s6, 252(sp)
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw s6, 260(sp)
 ; RV32V-NEXT:    lui t2, 131072
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw ra, 244(sp)
+; RV32V-NEXT:    sw zero, 248(sp)
+; RV32V-NEXT:    sw ra, 252(sp)
 ; RV32V-NEXT:    lui a1, 262144
-; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    sw zero, 240(sp)
 ; RV32V-NEXT:    li s6, 16
-; RV32V-NEXT:    sw s6, 236(sp)
+; RV32V-NEXT:    sw s6, 244(sp)
 ; RV32V-NEXT:    li s6, 16
-; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw zero, 232(sp)
 ; RV32V-NEXT:    li s5, 32
-; RV32V-NEXT:    sw s5, 228(sp)
+; RV32V-NEXT:    sw s5, 236(sp)
 ; RV32V-NEXT:    li s5, 32
-; RV32V-NEXT:    sw zero, 216(sp)
+; RV32V-NEXT:    sw zero, 224(sp)
 ; RV32V-NEXT:    li ra, 64
-; RV32V-NEXT:    sw ra, 220(sp)
+; RV32V-NEXT:    sw ra, 228(sp)
+; RV32V-NEXT:    sw zero, 216(sp)
+; RV32V-NEXT:    sw s10, 220(sp)
 ; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s10, 212(sp)
+; RV32V-NEXT:    sw s9, 212(sp)
 ; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s9, 204(sp)
+; RV32V-NEXT:    sw s8, 204(sp)
 ; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s8, 196(sp)
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s7, 188(sp)
+; RV32V-NEXT:    sw s7, 196(sp)
 ; RV32V-NEXT:    slli s11, s11, 11
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw s11, 188(sp)
 ; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw s11, 180(sp)
+; RV32V-NEXT:    sw s4, 180(sp)
 ; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw s4, 172(sp)
+; RV32V-NEXT:    sw s3, 172(sp)
 ; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s3, 164(sp)
+; RV32V-NEXT:    sw s2, 164(sp)
 ; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw s2, 156(sp)
+; RV32V-NEXT:    sw s1, 156(sp)
 ; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw s1, 148(sp)
+; RV32V-NEXT:    sw s0, 148(sp)
 ; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw s0, 140(sp)
+; RV32V-NEXT:    sw t6, 140(sp)
 ; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw t6, 132(sp)
+; RV32V-NEXT:    sw t5, 132(sp)
 ; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t5, 124(sp)
+; RV32V-NEXT:    sw t4, 124(sp)
 ; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t4, 116(sp)
+; RV32V-NEXT:    sw t3, 116(sp)
 ; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t3, 108(sp)
+; RV32V-NEXT:    sw t0, 108(sp)
 ; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw t0, 100(sp)
+; RV32V-NEXT:    sw a7, 100(sp)
 ; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw a7, 92(sp)
+; RV32V-NEXT:    sw a6, 92(sp)
 ; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw a6, 84(sp)
+; RV32V-NEXT:    sw a5, 84(sp)
 ; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a5, 76(sp)
+; RV32V-NEXT:    sw a4, 76(sp)
 ; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a4, 68(sp)
+; RV32V-NEXT:    sw a3, 68(sp)
 ; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a3, 60(sp)
+; RV32V-NEXT:    sw a2, 60(sp)
 ; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a2, 52(sp)
+; RV32V-NEXT:    sw a0, 52(sp)
 ; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw a0, 44(sp)
+; RV32V-NEXT:    sw t2, 44(sp)
 ; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw t2, 36(sp)
+; RV32V-NEXT:    sw a1, 36(sp)
 ; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    sw a1, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw t1, 20(sp)
-; RV32V-NEXT:    addi a1, sp, 272
+; RV32V-NEXT:    sw t1, 28(sp)
+; RV32V-NEXT:    addi a1, sp, 16
 ; RV32V-NEXT:    vlse64.v v12, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 264
+; RV32V-NEXT:    addi a1, sp, 272
 ; RV32V-NEXT:    vlse64.v v22, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 256
+; RV32V-NEXT:    addi a1, sp, 264
 ; RV32V-NEXT:    vlse64.v v14, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 248
+; RV32V-NEXT:    addi a1, sp, 256
 ; RV32V-NEXT:    vlse64.v v2, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 240
+; RV32V-NEXT:    addi a1, sp, 248
 ; RV32V-NEXT:    vlse64.v v26, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 232
+; RV32V-NEXT:    addi a1, sp, 240
 ; RV32V-NEXT:    vlse64.v v28, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 224
+; RV32V-NEXT:    addi a1, sp, 232
 ; RV32V-NEXT:    vlse64.v v30, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 216
+; RV32V-NEXT:    addi a1, sp, 224
 ; RV32V-NEXT:    vlse64.v v16, (a1), zero
 ; RV32V-NEXT:    csrr a1, vlenb
 ; RV32V-NEXT:    slli a1, a1, 1
@@ -6269,7 +6266,7 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vx(<vscale x 2 x i64> %va, i64 %b) noun
 ; RV32V-NEXT:    add a1, sp, a1
 ; RV32V-NEXT:    addi a1, a1, 288
 ; RV32V-NEXT:    vs2r.v v16, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi a1, sp, 208
+; RV32V-NEXT:    addi a1, sp, 216
 ; RV32V-NEXT:    vlse64.v v16, (a1), zero
 ; RV32V-NEXT:    csrr a1, vlenb
 ; RV32V-NEXT:    slli a1, a1, 3
@@ -6279,7 +6276,7 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vx(<vscale x 2 x i64> %va, i64 %b) noun
 ; RV32V-NEXT:    add a1, sp, a1
 ; RV32V-NEXT:    addi a1, a1, 288
 ; RV32V-NEXT:    vs2r.v v16, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi a1, sp, 200
+; RV32V-NEXT:    addi a1, sp, 208
 ; RV32V-NEXT:    vlse64.v v16, (a1), zero
 ; RV32V-NEXT:    csrr a1, vlenb
 ; RV32V-NEXT:    slli a1, a1, 1
@@ -6291,16 +6288,15 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vx(<vscale x 2 x i64> %va, i64 %b) noun
 ; RV32V-NEXT:    add a1, sp, a1
 ; RV32V-NEXT:    addi a1, a1, 288
 ; RV32V-NEXT:    vs2r.v v16, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi a1, sp, 192
+; RV32V-NEXT:    addi a1, sp, 200
 ; RV32V-NEXT:    vlse64.v v6, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 184
+; RV32V-NEXT:    addi a1, sp, 192
 ; RV32V-NEXT:    vlse64.v v4, (a1), zero
-; RV32V-NEXT:    addi ra, sp, 176
+; RV32V-NEXT:    addi ra, sp, 184
 ; RV32V-NEXT:    vand.vi v16, v10, 2
 ; RV32V-NEXT:    vand.vi v18, v10, 1
 ; RV32V-NEXT:    vmul.vv v16, v8, v16
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
-; RV32V-NEXT:    vxor.vi v18, v18, 0
 ; RV32V-NEXT:    vxor.vv v16, v18, v16
 ; RV32V-NEXT:    vand.vi v18, v10, 4
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
@@ -6309,97 +6305,97 @@ define <vscale x 2 x i64> @clmul_nxv2i64_vx(<vscale x 2 x i64> %va, i64 %b) noun
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    vand.vx v18, v10, s6
-; RV32V-NEXT:    addi s10, sp, 168
+; RV32V-NEXT:    addi s10, sp, 176
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    vand.vx v18, v10, s5
-; RV32V-NEXT:    addi s9, sp, 160
+; RV32V-NEXT:    addi s9, sp, 168
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    li a1, 64
 ; RV32V-NEXT:    vand.vx v18, v10, a1
-; RV32V-NEXT:    addi s8, sp, 152
+; RV32V-NEXT:    addi s8, sp, 160
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    li a1, 128
 ; RV32V-NEXT:    vand.vx v18, v10, a1
-; RV32V-NEXT:    addi s7, sp, 144
+; RV32V-NEXT:    addi s7, sp, 152
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    li a1, 256
 ; RV32V-NEXT:    vand.vx v18, v10, a1
-; RV32V-NEXT:    addi s6, sp, 136
+; RV32V-NEXT:    addi s6, sp, 144
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    li a1, 512
 ; RV32V-NEXT:    vand.vx v18, v10, a1
-; RV32V-NEXT:    addi s5, sp, 128
+; RV32V-NEXT:    addi s5, sp, 136
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    li a1, 1024
 ; RV32V-NEXT:    vand.vx v18, v10, a1
-; RV32V-NEXT:    addi s4, sp, 120
+; RV32V-NEXT:    addi s4, sp, 128
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    vand.vx v18, v10, s11
-; RV32V-NEXT:    addi s11, sp, 112
+; RV32V-NEXT:    addi s11, sp, 120
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    lui a1, 1
 ; RV32V-NEXT:    vand.vx v18, v10, a1
-; RV32V-NEXT:    addi s3, sp, 104
+; RV32V-NEXT:    addi s3, sp, 112
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    lui a1, 2
 ; RV32V-NEXT:    vand.vx v18, v10, a1
-; RV32V-NEXT:    addi s2, sp, 96
+; RV32V-NEXT:    addi s2, sp, 104
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    lui a1, 4
 ; RV32V-NEXT:    vand.vx v18, v10, a1
-; RV32V-NEXT:    addi s1, sp, 88
+; RV32V-NEXT:    addi s1, sp, 96
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    lui a1, 8
 ; RV32V-NEXT:    vand.vx v18, v10, a1
-; RV32V-NEXT:    addi s0, sp, 80
+; RV32V-NEXT:    addi s0, sp, 88
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    lui a1, 16
 ; RV32V-NEXT:    vand.vx v18, v10, a1
-; RV32V-NEXT:    addi t6, sp, 72
+; RV32V-NEXT:    addi t6, sp, 80
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    lui a1, 32
 ; RV32V-NEXT:    vand.vx v18, v10, a1
-; RV32V-NEXT:    addi t5, sp, 64
+; RV32V-NEXT:    addi t5, sp, 72
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    lui a1, 64
 ; RV32V-NEXT:    vand.vx v18, v10, a1
-; RV32V-NEXT:    addi t4, sp, 56
+; RV32V-NEXT:    addi t4, sp, 64
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    lui a1, 128
 ; RV32V-NEXT:    vand.vx v18, v10, a1
-; RV32V-NEXT:    addi t3, sp, 48
+; RV32V-NEXT:    addi t3, sp, 56
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    lui a1, 256
 ; RV32V-NEXT:    vand.vx v18, v10, a1
-; RV32V-NEXT:    addi t2, sp, 40
+; RV32V-NEXT:    addi t2, sp, 48
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    vand.vx v18, v10, t0
-; RV32V-NEXT:    addi t1, sp, 32
+; RV32V-NEXT:    addi t1, sp, 40
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    vand.vx v18, v10, a7
-; RV32V-NEXT:    addi a7, sp, 24
+; RV32V-NEXT:    addi a7, sp, 32
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    vand.vx v18, v10, a6
-; RV32V-NEXT:    addi a5, sp, 16
+; RV32V-NEXT:    addi a5, sp, 24
 ; RV32V-NEXT:    vmul.vv v18, v8, v18
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
 ; RV32V-NEXT:    lui a1, 4096
@@ -7241,22 +7237,22 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vv(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    sub sp, sp, a0
 ; RV32V-NEXT:    lui a1, 524288
-; RV32V-NEXT:    li s4, 1
+; RV32V-NEXT:    li s5, 1
 ; RV32V-NEXT:    li a3, 2
 ; RV32V-NEXT:    li a2, 4
 ; RV32V-NEXT:    li a0, 8
 ; RV32V-NEXT:    li s3, 16
 ; RV32V-NEXT:    li s2, 32
-; RV32V-NEXT:    li s5, 64
+; RV32V-NEXT:    li s4, 64
 ; RV32V-NEXT:    li s6, 128
 ; RV32V-NEXT:    li s8, 256
-; RV32V-NEXT:    li s1, 512
+; RV32V-NEXT:    li s0, 512
 ; RV32V-NEXT:    li s7, 1024
 ; RV32V-NEXT:    lui ra, 1
 ; RV32V-NEXT:    lui s11, 2
 ; RV32V-NEXT:    lui s10, 4
 ; RV32V-NEXT:    lui s9, 8
-; RV32V-NEXT:    lui s0, 16
+; RV32V-NEXT:    lui s1, 16
 ; RV32V-NEXT:    lui t6, 32
 ; RV32V-NEXT:    lui t5, 64
 ; RV32V-NEXT:    lui t4, 128
@@ -7268,25 +7264,25 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vv(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32V-NEXT:    lui a6, 8192
 ; RV32V-NEXT:    lui a5, 16384
 ; RV32V-NEXT:    lui a4, 32768
-; RV32V-NEXT:    sw a1, 272(sp)
-; RV32V-NEXT:    sw zero, 276(sp)
+; RV32V-NEXT:    sw a1, 16(sp)
+; RV32V-NEXT:    sw zero, 20(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw s5, 276(sp)
 ; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw s4, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw a3, 260(sp)
+; RV32V-NEXT:    sw a3, 268(sp)
 ; RV32V-NEXT:    lui a3, 65536
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw a2, 252(sp)
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw a2, 260(sp)
 ; RV32V-NEXT:    lui a2, 131072
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw a0, 244(sp)
+; RV32V-NEXT:    sw zero, 248(sp)
+; RV32V-NEXT:    sw a0, 252(sp)
 ; RV32V-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32V-NEXT:    vand.vi v28, v12, 2
 ; RV32V-NEXT:    vand.vi v4, v12, 1
 ; RV32V-NEXT:    vand.vi v24, v12, 4
 ; RV32V-NEXT:    vand.vi v20, v12, 8
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw s3, 236(sp)
+; RV32V-NEXT:    sw zero, 240(sp)
+; RV32V-NEXT:    sw s3, 244(sp)
 ; RV32V-NEXT:    vand.vx v16, v12, s3
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 2
@@ -7300,33 +7296,32 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vv(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
 ; RV32V-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi s3, sp, 272
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw s2, 228(sp)
+; RV32V-NEXT:    addi s3, sp, 16
+; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    sw s2, 236(sp)
 ; RV32V-NEXT:    vand.vx v0, v12, s2
-; RV32V-NEXT:    addi s2, sp, 264
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s5, 220(sp)
+; RV32V-NEXT:    addi s2, sp, 272
+; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw s4, 228(sp)
 ; RV32V-NEXT:    vmul.vv v16, v8, v28
 ; RV32V-NEXT:    vmul.vv v28, v8, v4
-; RV32V-NEXT:    vxor.vi v28, v28, 0
 ; RV32V-NEXT:    vxor.vv v28, v28, v16
-; RV32V-NEXT:    vand.vx v16, v12, s5
-; RV32V-NEXT:    addi s5, sp, 256
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s6, 212(sp)
+; RV32V-NEXT:    vand.vx v16, v12, s4
+; RV32V-NEXT:    addi s4, sp, 264
+; RV32V-NEXT:    sw zero, 216(sp)
+; RV32V-NEXT:    sw s6, 220(sp)
 ; RV32V-NEXT:    vmul.vv v24, v8, v24
 ; RV32V-NEXT:    vxor.vv v28, v28, v24
 ; RV32V-NEXT:    vand.vx v24, v12, s6
-; RV32V-NEXT:    addi s6, sp, 248
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s8, 204(sp)
+; RV32V-NEXT:    addi s6, sp, 256
+; RV32V-NEXT:    sw zero, 208(sp)
+; RV32V-NEXT:    sw s8, 212(sp)
 ; RV32V-NEXT:    vmul.vv v20, v8, v20
 ; RV32V-NEXT:    vxor.vv v20, v28, v20
 ; RV32V-NEXT:    vand.vx v28, v12, s8
-; RV32V-NEXT:    addi s8, sp, 240
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s1, 196(sp)
+; RV32V-NEXT:    addi s8, sp, 248
+; RV32V-NEXT:    sw zero, 200(sp)
+; RV32V-NEXT:    sw s0, 204(sp)
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
@@ -7341,97 +7336,97 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vv(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32V-NEXT:    vl4r.v v4, (a0) # vscale x 32-byte Folded Reload
 ; RV32V-NEXT:    vmul.vv v4, v8, v4
 ; RV32V-NEXT:    vxor.vv v20, v20, v4
-; RV32V-NEXT:    vand.vx v4, v12, s1
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s7, 188(sp)
+; RV32V-NEXT:    vand.vx v4, v12, s0
+; RV32V-NEXT:    sw zero, 192(sp)
+; RV32V-NEXT:    sw s7, 196(sp)
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v20, v20, v0
 ; RV32V-NEXT:    vand.vx v0, v12, s7
-; RV32V-NEXT:    slli a0, s4, 11
+; RV32V-NEXT:    slli a0, s5, 11
 ; RV32V-NEXT:    vmul.vv v16, v8, v16
 ; RV32V-NEXT:    vxor.vv v20, v20, v16
 ; RV32V-NEXT:    vand.vx v16, v12, ra
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw a0, 188(sp)
 ; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw a0, 180(sp)
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw ra, 172(sp)
-; RV32V-NEXT:    addi s4, sp, 216
+; RV32V-NEXT:    sw ra, 180(sp)
+; RV32V-NEXT:    addi s5, sp, 224
 ; RV32V-NEXT:    vmul.vv v24, v8, v24
 ; RV32V-NEXT:    vxor.vv v24, v20, v24
 ; RV32V-NEXT:    vand.vx v20, v12, s11
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s11, 164(sp)
-; RV32V-NEXT:    addi s11, sp, 208
+; RV32V-NEXT:    sw zero, 168(sp)
+; RV32V-NEXT:    sw s11, 172(sp)
+; RV32V-NEXT:    addi s11, sp, 216
 ; RV32V-NEXT:    vmul.vv v28, v8, v28
 ; RV32V-NEXT:    vxor.vv v28, v24, v28
 ; RV32V-NEXT:    vand.vx v24, v12, s10
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw s10, 156(sp)
-; RV32V-NEXT:    addi s10, sp, 200
+; RV32V-NEXT:    sw zero, 160(sp)
+; RV32V-NEXT:    sw s10, 164(sp)
+; RV32V-NEXT:    addi s10, sp, 208
 ; RV32V-NEXT:    vmul.vv v4, v8, v4
 ; RV32V-NEXT:    vxor.vv v4, v28, v4
 ; RV32V-NEXT:    vand.vx v28, v12, s9
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw s9, 148(sp)
-; RV32V-NEXT:    addi s9, sp, 192
+; RV32V-NEXT:    sw zero, 152(sp)
+; RV32V-NEXT:    sw s9, 156(sp)
+; RV32V-NEXT:    addi s9, sp, 200
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi ra, sp, 184
+; RV32V-NEXT:    addi ra, sp, 192
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v0, v4, v0
-; RV32V-NEXT:    vand.vx v4, v12, s0
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw s0, 140(sp)
-; RV32V-NEXT:    addi s1, sp, 176
+; RV32V-NEXT:    vand.vx v4, v12, s1
+; RV32V-NEXT:    sw zero, 144(sp)
+; RV32V-NEXT:    sw s1, 148(sp)
+; RV32V-NEXT:    addi s1, sp, 184
 ; RV32V-NEXT:    vmul.vv v16, v8, v16
 ; RV32V-NEXT:    vxor.vv v0, v0, v16
 ; RV32V-NEXT:    vand.vx v16, v12, t6
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw t6, 132(sp)
-; RV32V-NEXT:    addi s0, sp, 168
+; RV32V-NEXT:    sw zero, 136(sp)
+; RV32V-NEXT:    sw t6, 140(sp)
+; RV32V-NEXT:    addi s0, sp, 176
 ; RV32V-NEXT:    vmul.vv v20, v8, v20
 ; RV32V-NEXT:    vxor.vv v0, v0, v20
 ; RV32V-NEXT:    vand.vx v20, v12, t5
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t5, 124(sp)
-; RV32V-NEXT:    addi t6, sp, 160
+; RV32V-NEXT:    sw zero, 128(sp)
+; RV32V-NEXT:    sw t5, 132(sp)
+; RV32V-NEXT:    addi t6, sp, 168
 ; RV32V-NEXT:    vmul.vv v24, v8, v24
 ; RV32V-NEXT:    vxor.vv v0, v0, v24
 ; RV32V-NEXT:    vand.vx v24, v12, t4
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t4, 116(sp)
-; RV32V-NEXT:    addi t5, sp, 152
+; RV32V-NEXT:    sw zero, 120(sp)
+; RV32V-NEXT:    sw t4, 124(sp)
+; RV32V-NEXT:    addi t5, sp, 160
 ; RV32V-NEXT:    vmul.vv v28, v8, v28
 ; RV32V-NEXT:    vxor.vv v0, v0, v28
 ; RV32V-NEXT:    vand.vx v28, v12, t3
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t3, 108(sp)
-; RV32V-NEXT:    addi t4, sp, 144
+; RV32V-NEXT:    sw zero, 112(sp)
+; RV32V-NEXT:    sw t3, 116(sp)
+; RV32V-NEXT:    addi t4, sp, 152
 ; RV32V-NEXT:    vmul.vv v4, v8, v4
 ; RV32V-NEXT:    vxor.vv v0, v0, v4
 ; RV32V-NEXT:    vand.vx v4, v12, t2
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw t2, 100(sp)
-; RV32V-NEXT:    addi t3, sp, 136
+; RV32V-NEXT:    sw zero, 104(sp)
+; RV32V-NEXT:    sw t2, 108(sp)
+; RV32V-NEXT:    addi t3, sp, 144
 ; RV32V-NEXT:    vmul.vv v16, v8, v16
 ; RV32V-NEXT:    vxor.vv v16, v0, v16
 ; RV32V-NEXT:    vand.vx v0, v12, t1
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw t1, 92(sp)
-; RV32V-NEXT:    addi t2, sp, 128
+; RV32V-NEXT:    sw zero, 96(sp)
+; RV32V-NEXT:    sw t1, 100(sp)
+; RV32V-NEXT:    addi t2, sp, 136
 ; RV32V-NEXT:    vmul.vv v20, v8, v20
 ; RV32V-NEXT:    vxor.vv v20, v16, v20
 ; RV32V-NEXT:    vand.vx v16, v12, t0
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw t0, 84(sp)
-; RV32V-NEXT:    addi t1, sp, 120
+; RV32V-NEXT:    sw zero, 88(sp)
+; RV32V-NEXT:    sw t0, 92(sp)
+; RV32V-NEXT:    addi t1, sp, 128
 ; RV32V-NEXT:    vmul.vv v24, v8, v24
 ; RV32V-NEXT:    vxor.vv v24, v20, v24
 ; RV32V-NEXT:    vand.vx v20, v12, a7
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a7, 76(sp)
-; RV32V-NEXT:    addi t0, sp, 112
+; RV32V-NEXT:    sw zero, 80(sp)
+; RV32V-NEXT:    sw a7, 84(sp)
+; RV32V-NEXT:    addi t0, sp, 120
 ; RV32V-NEXT:    vmul.vv v28, v8, v28
 ; RV32V-NEXT:    vxor.vv v24, v24, v28
 ; RV32V-NEXT:    vand.vx v28, v12, a6
@@ -7447,9 +7442,9 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vv(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
 ; RV32V-NEXT:    vs4r.v v28, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a6, 68(sp)
-; RV32V-NEXT:    addi a7, sp, 104
+; RV32V-NEXT:    sw zero, 72(sp)
+; RV32V-NEXT:    sw a6, 76(sp)
+; RV32V-NEXT:    addi a7, sp, 112
 ; RV32V-NEXT:    vmul.vv v28, v8, v4
 ; RV32V-NEXT:    vxor.vv v24, v24, v28
 ; RV32V-NEXT:    vand.vx v28, v12, a5
@@ -7463,34 +7458,34 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vv(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
 ; RV32V-NEXT:    vs4r.v v28, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a5, 60(sp)
-; RV32V-NEXT:    addi a6, sp, 96
+; RV32V-NEXT:    sw zero, 64(sp)
+; RV32V-NEXT:    sw a5, 68(sp)
+; RV32V-NEXT:    addi a6, sp, 104
 ; RV32V-NEXT:    vmul.vv v28, v8, v0
 ; RV32V-NEXT:    vxor.vv v28, v24, v28
 ; RV32V-NEXT:    vand.vx v24, v12, a4
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a4, 52(sp)
-; RV32V-NEXT:    addi a5, sp, 88
+; RV32V-NEXT:    sw zero, 56(sp)
+; RV32V-NEXT:    sw a4, 60(sp)
+; RV32V-NEXT:    addi a5, sp, 96
 ; RV32V-NEXT:    vmul.vv v16, v8, v16
 ; RV32V-NEXT:    vxor.vv v16, v28, v16
 ; RV32V-NEXT:    vand.vx v28, v12, a3
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw a3, 44(sp)
-; RV32V-NEXT:    addi a4, sp, 80
+; RV32V-NEXT:    sw zero, 48(sp)
+; RV32V-NEXT:    sw a3, 52(sp)
+; RV32V-NEXT:    addi a4, sp, 88
 ; RV32V-NEXT:    vmul.vv v20, v8, v20
 ; RV32V-NEXT:    vxor.vv v16, v16, v20
 ; RV32V-NEXT:    vand.vx v4, v12, a2
+; RV32V-NEXT:    sw zero, 40(sp)
+; RV32V-NEXT:    sw a2, 44(sp)
+; RV32V-NEXT:    addi a3, sp, 80
 ; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw a2, 36(sp)
-; RV32V-NEXT:    addi a3, sp, 72
-; RV32V-NEXT:    sw zero, 24(sp)
 ; RV32V-NEXT:    lui a1, 262144
-; RV32V-NEXT:    sw a1, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
+; RV32V-NEXT:    sw a1, 36(sp)
+; RV32V-NEXT:    sw zero, 24(sp)
 ; RV32V-NEXT:    lui a0, 524288
-; RV32V-NEXT:    sw a0, 20(sp)
-; RV32V-NEXT:    addi a2, sp, 64
+; RV32V-NEXT:    sw a0, 28(sp)
+; RV32V-NEXT:    addi a2, sp, 72
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv s7, a0
@@ -7506,7 +7501,7 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vv(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32V-NEXT:    vmul.vv v20, v8, v20
 ; RV32V-NEXT:    vxor.vv v20, v16, v20
 ; RV32V-NEXT:    vlse64.v v16, (s3), zero
-; RV32V-NEXT:    addi s3, sp, 56
+; RV32V-NEXT:    addi s3, sp, 64
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv s7, a0
@@ -7520,15 +7515,15 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vv(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v0, v20, v0
 ; RV32V-NEXT:    vlse64.v v20, (s2), zero
-; RV32V-NEXT:    addi s2, sp, 48
+; RV32V-NEXT:    addi s2, sp, 56
 ; RV32V-NEXT:    vmul.vv v24, v8, v24
 ; RV32V-NEXT:    vxor.vv v0, v0, v24
-; RV32V-NEXT:    vlse64.v v24, (s5), zero
-; RV32V-NEXT:    addi s5, sp, 40
+; RV32V-NEXT:    vlse64.v v24, (s4), zero
+; RV32V-NEXT:    addi s4, sp, 48
 ; RV32V-NEXT:    vmul.vv v28, v8, v28
 ; RV32V-NEXT:    vxor.vv v0, v0, v28
 ; RV32V-NEXT:    vlse64.v v28, (s6), zero
-; RV32V-NEXT:    addi s6, sp, 32
+; RV32V-NEXT:    addi s6, sp, 40
 ; RV32V-NEXT:    vmul.vv v4, v8, v4
 ; RV32V-NEXT:    vxor.vv v4, v0, v4
 ; RV32V-NEXT:    csrr a0, vlenb
@@ -7544,7 +7539,7 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vv(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32V-NEXT:    addi a0, a0, 288
 ; RV32V-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    vlse64.v v4, (s8), zero
-; RV32V-NEXT:    addi s8, sp, 24
+; RV32V-NEXT:    addi s8, sp, 32
 ; RV32V-NEXT:    vand.vv v16, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 5
@@ -7600,52 +7595,52 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vv(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
 ; RV32V-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 16
-; RV32V-NEXT:    addi s7, sp, 232
+; RV32V-NEXT:    addi a0, sp, 24
+; RV32V-NEXT:    addi s7, sp, 240
 ; RV32V-NEXT:    vlse64.v v16, (s7), zero
-; RV32V-NEXT:    addi s7, sp, 224
+; RV32V-NEXT:    addi s7, sp, 232
 ; RV32V-NEXT:    vlse64.v v20, (s7), zero
-; RV32V-NEXT:    vlse64.v v24, (s4), zero
+; RV32V-NEXT:    vlse64.v v24, (s5), zero
 ; RV32V-NEXT:    vlse64.v v28, (s11), zero
 ; RV32V-NEXT:    vand.vv v16, v12, v16
-; RV32V-NEXT:    csrr s4, vlenb
-; RV32V-NEXT:    slli s4, s4, 4
-; RV32V-NEXT:    add s4, sp, s4
-; RV32V-NEXT:    addi s4, s4, 288
-; RV32V-NEXT:    vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr s5, vlenb
+; RV32V-NEXT:    slli s5, s5, 4
+; RV32V-NEXT:    add s5, sp, s5
+; RV32V-NEXT:    addi s5, s5, 288
+; RV32V-NEXT:    vs4r.v v16, (s5) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    vand.vv v16, v12, v20
-; RV32V-NEXT:    csrr s4, vlenb
-; RV32V-NEXT:    slli s4, s4, 2
-; RV32V-NEXT:    mv s7, s4
-; RV32V-NEXT:    slli s4, s4, 1
-; RV32V-NEXT:    add s7, s7, s4
-; RV32V-NEXT:    slli s4, s4, 2
-; RV32V-NEXT:    add s4, s4, s7
-; RV32V-NEXT:    add s4, sp, s4
-; RV32V-NEXT:    addi s4, s4, 288
-; RV32V-NEXT:    vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr s5, vlenb
+; RV32V-NEXT:    slli s5, s5, 2
+; RV32V-NEXT:    mv s7, s5
+; RV32V-NEXT:    slli s5, s5, 1
+; RV32V-NEXT:    add s7, s7, s5
+; RV32V-NEXT:    slli s5, s5, 2
+; RV32V-NEXT:    add s5, s5, s7
+; RV32V-NEXT:    add s5, sp, s5
+; RV32V-NEXT:    addi s5, s5, 288
+; RV32V-NEXT:    vs4r.v v16, (s5) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    vand.vv v16, v12, v24
-; RV32V-NEXT:    csrr s4, vlenb
-; RV32V-NEXT:    slli s4, s4, 2
-; RV32V-NEXT:    mv s7, s4
-; RV32V-NEXT:    slli s4, s4, 4
-; RV32V-NEXT:    add s4, s4, s7
-; RV32V-NEXT:    add s4, sp, s4
-; RV32V-NEXT:    addi s4, s4, 288
-; RV32V-NEXT:    vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr s5, vlenb
+; RV32V-NEXT:    slli s5, s5, 2
+; RV32V-NEXT:    mv s7, s5
+; RV32V-NEXT:    slli s5, s5, 4
+; RV32V-NEXT:    add s5, s5, s7
+; RV32V-NEXT:    add s5, sp, s5
+; RV32V-NEXT:    addi s5, s5, 288
+; RV32V-NEXT:    vs4r.v v16, (s5) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    vand.vv v16, v12, v28
-; RV32V-NEXT:    csrr s4, vlenb
-; RV32V-NEXT:    slli s4, s4, 2
-; RV32V-NEXT:    mv s7, s4
-; RV32V-NEXT:    slli s4, s4, 1
-; RV32V-NEXT:    add s7, s7, s4
-; RV32V-NEXT:    slli s4, s4, 1
-; RV32V-NEXT:    add s7, s7, s4
-; RV32V-NEXT:    slli s4, s4, 2
-; RV32V-NEXT:    add s4, s4, s7
-; RV32V-NEXT:    add s4, sp, s4
-; RV32V-NEXT:    addi s4, s4, 288
-; RV32V-NEXT:    vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr s5, vlenb
+; RV32V-NEXT:    slli s5, s5, 2
+; RV32V-NEXT:    mv s7, s5
+; RV32V-NEXT:    slli s5, s5, 1
+; RV32V-NEXT:    add s7, s7, s5
+; RV32V-NEXT:    slli s5, s5, 1
+; RV32V-NEXT:    add s7, s7, s5
+; RV32V-NEXT:    slli s5, s5, 2
+; RV32V-NEXT:    add s5, s5, s7
+; RV32V-NEXT:    add s5, sp, s5
+; RV32V-NEXT:    addi s5, s5, 288
+; RV32V-NEXT:    vs4r.v v16, (s5) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    vlse64.v v20, (s10), zero
 ; RV32V-NEXT:    vlse64.v v24, (s9), zero
 ; RV32V-NEXT:    vlse64.v v28, (ra), zero
@@ -7653,18 +7648,18 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vv(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32V-NEXT:    vand.vv v16, v12, v20
 ; RV32V-NEXT:    csrr s1, vlenb
 ; RV32V-NEXT:    slli s1, s1, 2
-; RV32V-NEXT:    mv s4, s1
+; RV32V-NEXT:    mv s5, s1
 ; RV32V-NEXT:    slli s1, s1, 1
-; RV32V-NEXT:    add s1, s1, s4
+; RV32V-NEXT:    add s1, s1, s5
 ; RV32V-NEXT:    add s1, sp, s1
 ; RV32V-NEXT:    addi s1, s1, 288
 ; RV32V-NEXT:    vs4r.v v16, (s1) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    vand.vv v16, v12, v24
 ; RV32V-NEXT:    csrr s1, vlenb
 ; RV32V-NEXT:    slli s1, s1, 3
-; RV32V-NEXT:    mv s4, s1
+; RV32V-NEXT:    mv s5, s1
 ; RV32V-NEXT:    slli s1, s1, 2
-; RV32V-NEXT:    add s1, s1, s4
+; RV32V-NEXT:    add s1, s1, s5
 ; RV32V-NEXT:    add s1, sp, s1
 ; RV32V-NEXT:    addi s1, s1, 288
 ; RV32V-NEXT:    vs4r.v v16, (s1) # vscale x 32-byte Folded Spill
@@ -7677,11 +7672,11 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vv(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32V-NEXT:    vand.vv v16, v12, v4
 ; RV32V-NEXT:    csrr s1, vlenb
 ; RV32V-NEXT:    slli s1, s1, 3
-; RV32V-NEXT:    mv s4, s1
+; RV32V-NEXT:    mv s5, s1
 ; RV32V-NEXT:    slli s1, s1, 1
-; RV32V-NEXT:    add s4, s4, s1
+; RV32V-NEXT:    add s5, s5, s1
 ; RV32V-NEXT:    slli s1, s1, 2
-; RV32V-NEXT:    add s1, s1, s4
+; RV32V-NEXT:    add s1, s1, s5
 ; RV32V-NEXT:    add s1, sp, s1
 ; RV32V-NEXT:    addi s1, s1, 288
 ; RV32V-NEXT:    vs4r.v v16, (s1) # vscale x 32-byte Folded Spill
@@ -7834,7 +7829,7 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vv(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32V-NEXT:    add a2, sp, a2
 ; RV32V-NEXT:    addi a2, a2, 288
 ; RV32V-NEXT:    vs4r.v v16, (a2) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v16, (s5), zero
+; RV32V-NEXT:    vlse64.v v16, (s4), zero
 ; RV32V-NEXT:    vlse64.v v20, (s6), zero
 ; RV32V-NEXT:    vlse64.v v24, (s8), zero
 ; RV32V-NEXT:    vlse64.v v28, (a0), zero
@@ -8532,98 +8527,97 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vx(<vscale x 4 x i64> %va, i64 %b) noun
 ; RV32V-NEXT:    vsetvli a4, zero, e64, m4, ta, ma
 ; RV32V-NEXT:    vlse64.v v12, (a1), zero
 ; RV32V-NEXT:    lui a3, 32768
-; RV32V-NEXT:    sw a2, 272(sp)
+; RV32V-NEXT:    sw a2, 16(sp)
 ; RV32V-NEXT:    lui t3, 524288
-; RV32V-NEXT:    sw zero, 276(sp)
+; RV32V-NEXT:    sw zero, 20(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw s11, 276(sp)
 ; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw s11, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw a0, 260(sp)
+; RV32V-NEXT:    sw a0, 268(sp)
 ; RV32V-NEXT:    lui a2, 65536
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw s9, 252(sp)
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw s9, 260(sp)
 ; RV32V-NEXT:    lui a1, 131072
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw s10, 244(sp)
+; RV32V-NEXT:    sw zero, 248(sp)
+; RV32V-NEXT:    sw s10, 252(sp)
 ; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    sw zero, 240(sp)
 ; RV32V-NEXT:    li a4, 16
-; RV32V-NEXT:    sw a4, 236(sp)
+; RV32V-NEXT:    sw a4, 244(sp)
 ; RV32V-NEXT:    li s10, 16
-; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw zero, 232(sp)
 ; RV32V-NEXT:    li a4, 32
-; RV32V-NEXT:    sw a4, 228(sp)
+; RV32V-NEXT:    sw a4, 236(sp)
 ; RV32V-NEXT:    li a4, 32
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw ra, 220(sp)
+; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw ra, 228(sp)
 ; RV32V-NEXT:    li ra, 64
-; RV32V-NEXT:    sw zero, 208(sp)
+; RV32V-NEXT:    sw zero, 216(sp)
 ; RV32V-NEXT:    li s9, 128
-; RV32V-NEXT:    sw s9, 212(sp)
+; RV32V-NEXT:    sw s9, 220(sp)
 ; RV32V-NEXT:    li s9, 128
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s8, 204(sp)
+; RV32V-NEXT:    sw zero, 208(sp)
+; RV32V-NEXT:    sw s8, 212(sp)
 ; RV32V-NEXT:    li s8, 256
+; RV32V-NEXT:    sw zero, 200(sp)
+; RV32V-NEXT:    sw s7, 204(sp)
 ; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s7, 196(sp)
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s6, 188(sp)
+; RV32V-NEXT:    sw s6, 196(sp)
 ; RV32V-NEXT:    slli s11, s11, 11
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw s11, 188(sp)
 ; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw s11, 180(sp)
+; RV32V-NEXT:    sw s5, 180(sp)
 ; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw s5, 172(sp)
+; RV32V-NEXT:    sw s4, 172(sp)
 ; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s4, 164(sp)
+; RV32V-NEXT:    sw s3, 164(sp)
 ; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw s3, 156(sp)
+; RV32V-NEXT:    sw s2, 156(sp)
 ; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw s2, 148(sp)
+; RV32V-NEXT:    sw s1, 148(sp)
 ; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw s1, 140(sp)
+; RV32V-NEXT:    sw s0, 140(sp)
 ; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw s0, 132(sp)
+; RV32V-NEXT:    sw t6, 132(sp)
 ; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t6, 124(sp)
+; RV32V-NEXT:    sw t5, 124(sp)
 ; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t5, 116(sp)
+; RV32V-NEXT:    sw t4, 116(sp)
 ; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t4, 108(sp)
+; RV32V-NEXT:    sw t2, 108(sp)
 ; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw t2, 100(sp)
+; RV32V-NEXT:    sw t1, 100(sp)
 ; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw t1, 92(sp)
+; RV32V-NEXT:    sw t0, 92(sp)
 ; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw t0, 84(sp)
+; RV32V-NEXT:    sw a7, 84(sp)
 ; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a7, 76(sp)
+; RV32V-NEXT:    sw a6, 76(sp)
 ; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a6, 68(sp)
+; RV32V-NEXT:    sw a5, 68(sp)
 ; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a5, 60(sp)
+; RV32V-NEXT:    sw a3, 60(sp)
 ; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a3, 52(sp)
+; RV32V-NEXT:    sw a2, 52(sp)
 ; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw a2, 44(sp)
+; RV32V-NEXT:    sw a1, 44(sp)
 ; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw a1, 36(sp)
+; RV32V-NEXT:    sw a0, 36(sp)
 ; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    sw a0, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw t3, 20(sp)
-; RV32V-NEXT:    addi a0, sp, 272
+; RV32V-NEXT:    sw t3, 28(sp)
+; RV32V-NEXT:    addi a0, sp, 16
 ; RV32V-NEXT:    vlse64.v v16, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 264
+; RV32V-NEXT:    addi a0, sp, 272
 ; RV32V-NEXT:    vlse64.v v28, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 256
+; RV32V-NEXT:    addi a0, sp, 264
 ; RV32V-NEXT:    vlse64.v v20, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 248
+; RV32V-NEXT:    addi a0, sp, 256
 ; RV32V-NEXT:    vlse64.v v24, (a0), zero
 ; RV32V-NEXT:    vand.vi v4, v12, 2
 ; RV32V-NEXT:    vand.vi v0, v12, 1
 ; RV32V-NEXT:    vmul.vv v4, v8, v4
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
-; RV32V-NEXT:    vxor.vi v0, v0, 0
 ; RV32V-NEXT:    vxor.vv v4, v0, v4
 ; RV32V-NEXT:    vand.vi v0, v12, 4
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
@@ -8638,117 +8632,117 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vx(<vscale x 4 x i64> %va, i64 %b) noun
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    vand.vx v0, v12, ra
-; RV32V-NEXT:    addi s10, sp, 216
+; RV32V-NEXT:    addi s10, sp, 224
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    vand.vx v0, v12, s9
-; RV32V-NEXT:    addi s7, sp, 208
+; RV32V-NEXT:    addi s7, sp, 216
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    vand.vx v0, v12, s8
-; RV32V-NEXT:    addi s6, sp, 200
+; RV32V-NEXT:    addi s6, sp, 208
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    li a0, 512
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi s5, sp, 192
+; RV32V-NEXT:    addi s5, sp, 200
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    li a0, 1024
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi s4, sp, 184
+; RV32V-NEXT:    addi s4, sp, 192
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    vand.vx v0, v12, s11
-; RV32V-NEXT:    addi s11, sp, 176
+; RV32V-NEXT:    addi s11, sp, 184
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    lui a0, 1
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi s3, sp, 168
+; RV32V-NEXT:    addi s3, sp, 176
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    lui a0, 2
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi s2, sp, 160
+; RV32V-NEXT:    addi s2, sp, 168
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    lui a0, 4
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi s1, sp, 152
+; RV32V-NEXT:    addi s1, sp, 160
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    lui a0, 8
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi s0, sp, 144
+; RV32V-NEXT:    addi s0, sp, 152
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    lui a0, 16
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi t6, sp, 136
+; RV32V-NEXT:    addi t6, sp, 144
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    lui a0, 32
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi t5, sp, 128
+; RV32V-NEXT:    addi t5, sp, 136
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    lui a0, 64
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi t4, sp, 120
+; RV32V-NEXT:    addi t4, sp, 128
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    lui a0, 128
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi t3, sp, 112
+; RV32V-NEXT:    addi t3, sp, 120
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    lui a0, 256
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi t2, sp, 104
+; RV32V-NEXT:    addi t2, sp, 112
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    lui a0, 512
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi t1, sp, 96
+; RV32V-NEXT:    addi t1, sp, 104
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    lui a0, 1024
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi t0, sp, 88
+; RV32V-NEXT:    addi t0, sp, 96
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    lui a0, 2048
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi a7, sp, 80
+; RV32V-NEXT:    addi a7, sp, 88
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    lui a0, 4096
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi a6, sp, 72
+; RV32V-NEXT:    addi a6, sp, 80
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    lui a0, 8192
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi a5, sp, 64
+; RV32V-NEXT:    addi a5, sp, 72
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    lui a0, 16384
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi a4, sp, 56
+; RV32V-NEXT:    addi a4, sp, 64
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    vand.vx v0, v12, a3
-; RV32V-NEXT:    addi a2, sp, 48
+; RV32V-NEXT:    addi a2, sp, 56
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    lui a0, 65536
 ; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    addi a3, sp, 40
+; RV32V-NEXT:    addi a3, sp, 48
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    vand.vx v0, v12, a1
-; RV32V-NEXT:    addi a1, sp, 32
+; RV32V-NEXT:    addi a1, sp, 40
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v4, v4, v0
 ; RV32V-NEXT:    csrr a0, vlenb
@@ -8763,9 +8757,9 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vx(<vscale x 4 x i64> %va, i64 %b) noun
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
 ; RV32V-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 240
+; RV32V-NEXT:    addi a0, sp, 248
 ; RV32V-NEXT:    vlse64.v v4, (a0), zero
-; RV32V-NEXT:    addi ra, sp, 24
+; RV32V-NEXT:    addi ra, sp, 32
 ; RV32V-NEXT:    vand.vv v16, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 5
@@ -8821,10 +8815,10 @@ define <vscale x 4 x i64> @clmul_nxv4i64_vx(<vscale x 4 x i64> %va, i64 %b) noun
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
 ; RV32V-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 16
-; RV32V-NEXT:    addi s8, sp, 232
+; RV32V-NEXT:    addi a0, sp, 24
+; RV32V-NEXT:    addi s8, sp, 240
 ; RV32V-NEXT:    vlse64.v v16, (s8), zero
-; RV32V-NEXT:    addi s8, sp, 224
+; RV32V-NEXT:    addi s8, sp, 232
 ; RV32V-NEXT:    vlse64.v v20, (s8), zero
 ; RV32V-NEXT:    vlse64.v v24, (s10), zero
 ; RV32V-NEXT:    vlse64.v v28, (s7), zero
@@ -9748,184 +9742,183 @@ define <vscale x 8 x i64> @clmul_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32V-NEXT:    lui a6, 8192
 ; RV32V-NEXT:    lui a5, 16384
 ; RV32V-NEXT:    lui a4, 32768
-; RV32V-NEXT:    sw a1, 272(sp)
-; RV32V-NEXT:    sw zero, 276(sp)
+; RV32V-NEXT:    sw a1, 16(sp)
+; RV32V-NEXT:    sw zero, 20(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw s5, 276(sp)
 ; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw s5, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw a3, 260(sp)
+; RV32V-NEXT:    sw a3, 268(sp)
 ; RV32V-NEXT:    lui a3, 65536
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw a2, 252(sp)
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw a2, 260(sp)
 ; RV32V-NEXT:    lui a2, 131072
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw s10, 244(sp)
+; RV32V-NEXT:    sw zero, 248(sp)
+; RV32V-NEXT:    sw s10, 252(sp)
 ; RV32V-NEXT:    vsetvli s10, zero, e64, m8, ta, ma
 ; RV32V-NEXT:    vand.vi v24, v16, 2
 ; RV32V-NEXT:    vand.vi v0, v16, 1
 ; RV32V-NEXT:    vmul.vv v24, v8, v24
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
-; RV32V-NEXT:    vxor.vi v0, v0, 0
 ; RV32V-NEXT:    vxor.vv v24, v0, v24
 ; RV32V-NEXT:    vand.vi v0, v16, 4
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vi v0, v16, 8
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw a0, 236(sp)
+; RV32V-NEXT:    sw zero, 240(sp)
+; RV32V-NEXT:    sw a0, 244(sp)
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, a0
-; RV32V-NEXT:    addi s10, sp, 272
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw t6, 228(sp)
+; RV32V-NEXT:    addi s10, sp, 16
+; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    sw t6, 236(sp)
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, t6
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s1, 220(sp)
+; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw s1, 228(sp)
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s1
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s3, 212(sp)
+; RV32V-NEXT:    sw zero, 216(sp)
+; RV32V-NEXT:    sw s3, 220(sp)
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s3
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s7, 204(sp)
+; RV32V-NEXT:    sw zero, 208(sp)
+; RV32V-NEXT:    sw s7, 212(sp)
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s7
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s4, 196(sp)
+; RV32V-NEXT:    sw zero, 200(sp)
+; RV32V-NEXT:    sw s4, 204(sp)
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s4
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s8, 188(sp)
+; RV32V-NEXT:    sw zero, 192(sp)
+; RV32V-NEXT:    sw s8, 196(sp)
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s8
 ; RV32V-NEXT:    slli s5, s5, 11
-; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw s5, 180(sp)
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw s5, 188(sp)
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s5
-; RV32V-NEXT:    addi s5, sp, 216
+; RV32V-NEXT:    addi s5, sp, 224
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, ra
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw ra, 172(sp)
-; RV32V-NEXT:    addi ra, sp, 208
+; RV32V-NEXT:    sw zero, 176(sp)
+; RV32V-NEXT:    sw ra, 180(sp)
+; RV32V-NEXT:    addi ra, sp, 216
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s11
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s11, 164(sp)
-; RV32V-NEXT:    addi s11, sp, 200
+; RV32V-NEXT:    sw zero, 168(sp)
+; RV32V-NEXT:    sw s11, 172(sp)
+; RV32V-NEXT:    addi s11, sp, 208
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s9
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw s9, 156(sp)
-; RV32V-NEXT:    addi s9, sp, 192
+; RV32V-NEXT:    sw zero, 160(sp)
+; RV32V-NEXT:    sw s9, 164(sp)
+; RV32V-NEXT:    addi s9, sp, 200
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s6
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw s6, 148(sp)
-; RV32V-NEXT:    addi s6, sp, 184
+; RV32V-NEXT:    sw zero, 152(sp)
+; RV32V-NEXT:    sw s6, 156(sp)
+; RV32V-NEXT:    addi s6, sp, 192
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s2
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw s2, 140(sp)
-; RV32V-NEXT:    addi s3, sp, 176
+; RV32V-NEXT:    sw zero, 144(sp)
+; RV32V-NEXT:    sw s2, 148(sp)
+; RV32V-NEXT:    addi s3, sp, 184
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s0
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw s0, 132(sp)
-; RV32V-NEXT:    addi s4, sp, 168
+; RV32V-NEXT:    sw zero, 136(sp)
+; RV32V-NEXT:    sw s0, 140(sp)
+; RV32V-NEXT:    addi s4, sp, 176
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, t5
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t5, 124(sp)
-; RV32V-NEXT:    addi s2, sp, 160
+; RV32V-NEXT:    sw zero, 128(sp)
+; RV32V-NEXT:    sw t5, 132(sp)
+; RV32V-NEXT:    addi s2, sp, 168
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, t4
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t4, 116(sp)
-; RV32V-NEXT:    addi s1, sp, 152
+; RV32V-NEXT:    sw zero, 120(sp)
+; RV32V-NEXT:    sw t4, 124(sp)
+; RV32V-NEXT:    addi s1, sp, 160
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, t3
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t3, 108(sp)
-; RV32V-NEXT:    addi t6, sp, 144
+; RV32V-NEXT:    sw zero, 112(sp)
+; RV32V-NEXT:    sw t3, 116(sp)
+; RV32V-NEXT:    addi t6, sp, 152
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, t2
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw t2, 100(sp)
-; RV32V-NEXT:    addi s0, sp, 136
+; RV32V-NEXT:    sw zero, 104(sp)
+; RV32V-NEXT:    sw t2, 108(sp)
+; RV32V-NEXT:    addi s0, sp, 144
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, t1
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw t1, 92(sp)
-; RV32V-NEXT:    addi t5, sp, 128
+; RV32V-NEXT:    sw zero, 96(sp)
+; RV32V-NEXT:    sw t1, 100(sp)
+; RV32V-NEXT:    addi t5, sp, 136
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, t0
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw t0, 84(sp)
-; RV32V-NEXT:    addi t4, sp, 120
+; RV32V-NEXT:    sw zero, 88(sp)
+; RV32V-NEXT:    sw t0, 92(sp)
+; RV32V-NEXT:    addi t4, sp, 128
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, a7
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a7, 76(sp)
-; RV32V-NEXT:    addi t2, sp, 112
+; RV32V-NEXT:    sw zero, 80(sp)
+; RV32V-NEXT:    sw a7, 84(sp)
+; RV32V-NEXT:    addi t2, sp, 120
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, a6
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a6, 68(sp)
-; RV32V-NEXT:    addi t3, sp, 104
+; RV32V-NEXT:    sw zero, 72(sp)
+; RV32V-NEXT:    sw a6, 76(sp)
+; RV32V-NEXT:    addi t3, sp, 112
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, a5
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a5, 60(sp)
-; RV32V-NEXT:    addi t1, sp, 96
+; RV32V-NEXT:    sw zero, 64(sp)
+; RV32V-NEXT:    sw a5, 68(sp)
+; RV32V-NEXT:    addi t1, sp, 104
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, a4
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a4, 52(sp)
-; RV32V-NEXT:    addi t0, sp, 88
+; RV32V-NEXT:    sw zero, 56(sp)
+; RV32V-NEXT:    sw a4, 60(sp)
+; RV32V-NEXT:    addi t0, sp, 96
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, a3
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw a3, 44(sp)
-; RV32V-NEXT:    addi a7, sp, 80
+; RV32V-NEXT:    sw zero, 48(sp)
+; RV32V-NEXT:    sw a3, 52(sp)
+; RV32V-NEXT:    addi a7, sp, 88
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, a2
+; RV32V-NEXT:    sw zero, 40(sp)
+; RV32V-NEXT:    sw a2, 44(sp)
 ; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw a2, 36(sp)
-; RV32V-NEXT:    sw zero, 24(sp)
 ; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    sw a0, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw a1, 20(sp)
+; RV32V-NEXT:    sw a0, 36(sp)
+; RV32V-NEXT:    sw zero, 24(sp)
+; RV32V-NEXT:    sw a1, 28(sp)
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    sw t2, 4(sp) # 4-byte Folded Spill
@@ -9938,20 +9931,20 @@ define <vscale x 8 x i64> @clmul_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32V-NEXT:    addi a1, a1, 288
 ; RV32V-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    vlse64.v v24, (s10), zero
-; RV32V-NEXT:    addi a6, sp, 72
-; RV32V-NEXT:    addi a5, sp, 64
-; RV32V-NEXT:    addi a4, sp, 56
+; RV32V-NEXT:    addi a6, sp, 80
+; RV32V-NEXT:    addi a5, sp, 72
+; RV32V-NEXT:    addi a4, sp, 64
 ; RV32V-NEXT:    vand.vv v24, v16, v24
 ; RV32V-NEXT:    csrr a1, vlenb
 ; RV32V-NEXT:    slli a1, a1, 8
 ; RV32V-NEXT:    add a1, sp, a1
 ; RV32V-NEXT:    addi a1, a1, 288
 ; RV32V-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a1, sp, 48
-; RV32V-NEXT:    addi s10, sp, 40
-; RV32V-NEXT:    addi a3, sp, 32
-; RV32V-NEXT:    addi a2, sp, 24
-; RV32V-NEXT:    addi s7, sp, 264
+; RV32V-NEXT:    addi a1, sp, 56
+; RV32V-NEXT:    addi s10, sp, 48
+; RV32V-NEXT:    addi a3, sp, 40
+; RV32V-NEXT:    addi a2, sp, 32
+; RV32V-NEXT:    addi s7, sp, 272
 ; RV32V-NEXT:    vlse64.v v24, (s7), zero
 ; RV32V-NEXT:    csrr t2, vlenb
 ; RV32V-NEXT:    slli t2, t2, 4
@@ -9965,9 +9958,9 @@ define <vscale x 8 x i64> @clmul_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32V-NEXT:    add t2, sp, t2
 ; RV32V-NEXT:    addi t2, t2, 288
 ; RV32V-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi s7, sp, 256
+; RV32V-NEXT:    addi s7, sp, 264
 ; RV32V-NEXT:    vlse64.v v0, (s7), zero
-; RV32V-NEXT:    addi s7, sp, 248
+; RV32V-NEXT:    addi s7, sp, 256
 ; RV32V-NEXT:    vlse64.v v24, (s7), zero
 ; RV32V-NEXT:    csrr t2, vlenb
 ; RV32V-NEXT:    slli t2, t2, 3
@@ -9983,7 +9976,7 @@ define <vscale x 8 x i64> @clmul_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32V-NEXT:    add t2, sp, t2
 ; RV32V-NEXT:    addi t2, t2, 288
 ; RV32V-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi s7, sp, 240
+; RV32V-NEXT:    addi s7, sp, 248
 ; RV32V-NEXT:    vlse64.v v24, (s7), zero
 ; RV32V-NEXT:    csrr t2, vlenb
 ; RV32V-NEXT:    slli t2, t2, 3
@@ -10087,8 +10080,8 @@ define <vscale x 8 x i64> @clmul_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32V-NEXT:    add t2, sp, t2
 ; RV32V-NEXT:    addi t2, t2, 288
 ; RV32V-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi s7, sp, 16
-; RV32V-NEXT:    addi s8, sp, 232
+; RV32V-NEXT:    addi s7, sp, 24
+; RV32V-NEXT:    addi s8, sp, 240
 ; RV32V-NEXT:    vlse64.v v24, (s8), zero
 ; RV32V-NEXT:    csrr s8, vlenb
 ; RV32V-NEXT:    slli s8, s8, 4
@@ -10101,7 +10094,7 @@ define <vscale x 8 x i64> @clmul_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32V-NEXT:    add s8, sp, s8
 ; RV32V-NEXT:    addi s8, s8, 288
 ; RV32V-NEXT:    vs8r.v v24, (s8) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi s8, sp, 224
+; RV32V-NEXT:    addi s8, sp, 232
 ; RV32V-NEXT:    vlse64.v v0, (s8), zero
 ; RV32V-NEXT:    vlse64.v v24, (s5), zero
 ; RV32V-NEXT:    csrr s5, vlenb
@@ -11473,93 +11466,92 @@ define <vscale x 8 x i64> @clmul_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) noun
 ; RV32V-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
 ; RV32V-NEXT:    vlse64.v v16, (a0), zero
 ; RV32V-NEXT:    lui a2, 32768
-; RV32V-NEXT:    sw a1, 272(sp)
+; RV32V-NEXT:    sw a1, 16(sp)
 ; RV32V-NEXT:    lui t3, 524288
-; RV32V-NEXT:    sw zero, 276(sp)
+; RV32V-NEXT:    sw zero, 20(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw ra, 276(sp)
 ; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw ra, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw a6, 260(sp)
+; RV32V-NEXT:    sw a6, 268(sp)
 ; RV32V-NEXT:    lui a6, 65536
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw s8, 252(sp)
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw s8, 260(sp)
 ; RV32V-NEXT:    lui a1, 131072
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw s11, 244(sp)
+; RV32V-NEXT:    sw zero, 248(sp)
+; RV32V-NEXT:    sw s11, 252(sp)
 ; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    sw zero, 240(sp)
 ; RV32V-NEXT:    li a3, 16
-; RV32V-NEXT:    sw a3, 236(sp)
+; RV32V-NEXT:    sw a3, 244(sp)
 ; RV32V-NEXT:    li s8, 16
-; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw zero, 232(sp)
 ; RV32V-NEXT:    li a3, 32
-; RV32V-NEXT:    sw a3, 228(sp)
+; RV32V-NEXT:    sw a3, 236(sp)
 ; RV32V-NEXT:    li a3, 32
-; RV32V-NEXT:    sw zero, 216(sp)
+; RV32V-NEXT:    sw zero, 224(sp)
 ; RV32V-NEXT:    li s11, 64
-; RV32V-NEXT:    sw s11, 220(sp)
+; RV32V-NEXT:    sw s11, 228(sp)
+; RV32V-NEXT:    sw zero, 216(sp)
+; RV32V-NEXT:    sw s10, 220(sp)
 ; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s10, 212(sp)
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s9, 204(sp)
+; RV32V-NEXT:    sw s9, 212(sp)
 ; RV32V-NEXT:    li s9, 256
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s7, 196(sp)
+; RV32V-NEXT:    sw zero, 200(sp)
+; RV32V-NEXT:    sw s7, 204(sp)
 ; RV32V-NEXT:    li s7, 512
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s6, 188(sp)
+; RV32V-NEXT:    sw zero, 192(sp)
+; RV32V-NEXT:    sw s6, 196(sp)
 ; RV32V-NEXT:    li s6, 1024
 ; RV32V-NEXT:    slli ra, ra, 11
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw ra, 188(sp)
 ; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw ra, 180(sp)
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw s5, 172(sp)
+; RV32V-NEXT:    sw s5, 180(sp)
 ; RV32V-NEXT:    lui s11, 1
+; RV32V-NEXT:    sw zero, 168(sp)
+; RV32V-NEXT:    sw s4, 172(sp)
 ; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s4, 164(sp)
+; RV32V-NEXT:    sw s3, 164(sp)
 ; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw s3, 156(sp)
+; RV32V-NEXT:    sw s2, 156(sp)
 ; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw s2, 148(sp)
+; RV32V-NEXT:    sw s1, 148(sp)
 ; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw s1, 140(sp)
+; RV32V-NEXT:    sw s0, 140(sp)
 ; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw s0, 132(sp)
+; RV32V-NEXT:    sw t6, 132(sp)
 ; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t6, 124(sp)
+; RV32V-NEXT:    sw t5, 124(sp)
 ; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t5, 116(sp)
+; RV32V-NEXT:    sw t4, 116(sp)
 ; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t4, 108(sp)
+; RV32V-NEXT:    sw t2, 108(sp)
 ; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw t2, 100(sp)
+; RV32V-NEXT:    sw t1, 100(sp)
 ; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw t1, 92(sp)
+; RV32V-NEXT:    sw t0, 92(sp)
 ; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw t0, 84(sp)
+; RV32V-NEXT:    sw a7, 84(sp)
 ; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a7, 76(sp)
+; RV32V-NEXT:    sw a5, 76(sp)
 ; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a5, 68(sp)
+; RV32V-NEXT:    sw a4, 68(sp)
 ; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a4, 60(sp)
+; RV32V-NEXT:    sw a2, 60(sp)
 ; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a2, 52(sp)
+; RV32V-NEXT:    sw a6, 52(sp)
 ; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw a6, 44(sp)
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw a1, 36(sp)
+; RV32V-NEXT:    sw a1, 44(sp)
 ; RV32V-NEXT:    lui s10, 131072
+; RV32V-NEXT:    sw zero, 32(sp)
+; RV32V-NEXT:    sw a0, 36(sp)
 ; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    sw a0, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw t3, 20(sp)
-; RV32V-NEXT:    addi a1, sp, 272
+; RV32V-NEXT:    sw t3, 28(sp)
+; RV32V-NEXT:    addi a1, sp, 16
 ; RV32V-NEXT:    vand.vi v24, v16, 2
 ; RV32V-NEXT:    vand.vi v0, v16, 1
 ; RV32V-NEXT:    vmul.vv v24, v8, v24
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
-; RV32V-NEXT:    vxor.vi v0, v0, 0
 ; RV32V-NEXT:    vxor.vv v24, v0, v24
 ; RV32V-NEXT:    vand.vi v0, v16, 4
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
@@ -11579,86 +11571,86 @@ define <vscale x 8 x i64> @clmul_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) noun
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    li a0, 128
 ; RV32V-NEXT:    vand.vx v0, v16, a0
-; RV32V-NEXT:    addi s8, sp, 240
+; RV32V-NEXT:    addi s8, sp, 248
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s9
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s7
-; RV32V-NEXT:    addi s9, sp, 224
+; RV32V-NEXT:    addi s9, sp, 232
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s6
-; RV32V-NEXT:    addi s7, sp, 216
+; RV32V-NEXT:    addi s7, sp, 224
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, ra
-; RV32V-NEXT:    addi ra, sp, 208
+; RV32V-NEXT:    addi ra, sp, 216
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s11
-; RV32V-NEXT:    addi s6, sp, 200
+; RV32V-NEXT:    addi s6, sp, 208
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s4
-; RV32V-NEXT:    addi s5, sp, 192
+; RV32V-NEXT:    addi s5, sp, 200
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s3
-; RV32V-NEXT:    addi s4, sp, 184
+; RV32V-NEXT:    addi s4, sp, 192
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s2
-; RV32V-NEXT:    addi s2, sp, 176
+; RV32V-NEXT:    addi s2, sp, 184
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s1
-; RV32V-NEXT:    addi s3, sp, 168
+; RV32V-NEXT:    addi s3, sp, 176
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, s0
-; RV32V-NEXT:    addi s1, sp, 160
+; RV32V-NEXT:    addi s1, sp, 168
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, t6
-; RV32V-NEXT:    addi s0, sp, 152
+; RV32V-NEXT:    addi s0, sp, 160
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, t5
-; RV32V-NEXT:    addi t5, sp, 144
+; RV32V-NEXT:    addi t5, sp, 152
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, t4
-; RV32V-NEXT:    addi t6, sp, 136
+; RV32V-NEXT:    addi t6, sp, 144
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, t2
-; RV32V-NEXT:    addi t4, sp, 128
+; RV32V-NEXT:    addi t4, sp, 136
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, t1
-; RV32V-NEXT:    addi t3, sp, 120
+; RV32V-NEXT:    addi t3, sp, 128
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, t0
-; RV32V-NEXT:    addi t1, sp, 112
+; RV32V-NEXT:    addi t1, sp, 120
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, a7
-; RV32V-NEXT:    addi t2, sp, 104
+; RV32V-NEXT:    addi t2, sp, 112
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, a5
-; RV32V-NEXT:    addi t0, sp, 96
+; RV32V-NEXT:    addi t0, sp, 104
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, a4
-; RV32V-NEXT:    addi a7, sp, 88
+; RV32V-NEXT:    addi a7, sp, 96
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    vand.vx v0, v16, a2
-; RV32V-NEXT:    addi a6, sp, 80
+; RV32V-NEXT:    addi a6, sp, 88
 ; RV32V-NEXT:    vmul.vv v0, v8, v0
 ; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    lui a0, 65536
@@ -11678,20 +11670,20 @@ define <vscale x 8 x i64> @clmul_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) noun
 ; RV32V-NEXT:    addi a0, a0, 288
 ; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    vlse64.v v24, (a1), zero
-; RV32V-NEXT:    addi a5, sp, 72
-; RV32V-NEXT:    addi a4, sp, 64
-; RV32V-NEXT:    addi a2, sp, 56
+; RV32V-NEXT:    addi a5, sp, 80
+; RV32V-NEXT:    addi a4, sp, 72
+; RV32V-NEXT:    addi a2, sp, 64
 ; RV32V-NEXT:    vand.vv v24, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 8
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
 ; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 48
-; RV32V-NEXT:    addi s11, sp, 40
-; RV32V-NEXT:    addi a3, sp, 32
-; RV32V-NEXT:    addi a1, sp, 24
-; RV32V-NEXT:    addi s10, sp, 264
+; RV32V-NEXT:    addi a0, sp, 56
+; RV32V-NEXT:    addi s11, sp, 48
+; RV32V-NEXT:    addi a3, sp, 40
+; RV32V-NEXT:    addi a1, sp, 32
+; RV32V-NEXT:    addi s10, sp, 272
 ; RV32V-NEXT:    vlse64.v v24, (s10), zero
 ; RV32V-NEXT:    csrr t1, vlenb
 ; RV32V-NEXT:    slli t1, t1, 4
@@ -11705,9 +11697,9 @@ define <vscale x 8 x i64> @clmul_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) noun
 ; RV32V-NEXT:    add t1, sp, t1
 ; RV32V-NEXT:    addi t1, t1, 288
 ; RV32V-NEXT:    vs8r.v v24, (t1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi s10, sp, 256
+; RV32V-NEXT:    addi s10, sp, 264
 ; RV32V-NEXT:    vlse64.v v0, (s10), zero
-; RV32V-NEXT:    addi s10, sp, 248
+; RV32V-NEXT:    addi s10, sp, 256
 ; RV32V-NEXT:    vlse64.v v24, (s10), zero
 ; RV32V-NEXT:    csrr t1, vlenb
 ; RV32V-NEXT:    slli t1, t1, 3
@@ -11826,8 +11818,8 @@ define <vscale x 8 x i64> @clmul_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) noun
 ; RV32V-NEXT:    add t1, sp, t1
 ; RV32V-NEXT:    addi t1, t1, 288
 ; RV32V-NEXT:    vs8r.v v24, (t1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi s8, sp, 16
-; RV32V-NEXT:    addi s10, sp, 232
+; RV32V-NEXT:    addi s8, sp, 24
+; RV32V-NEXT:    addi s10, sp, 240
 ; RV32V-NEXT:    vlse64.v v24, (s10), zero
 ; RV32V-NEXT:    csrr s10, vlenb
 ; RV32V-NEXT:    slli s10, s10, 4
@@ -13231,137 +13223,136 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscale
 ; RV32V-NEXT:    lui a6, 8192
 ; RV32V-NEXT:    lui a5, 16384
 ; RV32V-NEXT:    lui a3, 32768
-; RV32V-NEXT:    sw a1, 272(sp)
-; RV32V-NEXT:    sw zero, 276(sp)
+; RV32V-NEXT:    sw a1, 16(sp)
+; RV32V-NEXT:    sw zero, 20(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw t5, 276(sp)
 ; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw t5, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw a4, 260(sp)
+; RV32V-NEXT:    sw a4, 268(sp)
 ; RV32V-NEXT:    lui a4, 65536
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw a2, 252(sp)
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw a2, 260(sp)
 ; RV32V-NEXT:    lui a2, 131072
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw ra, 244(sp)
+; RV32V-NEXT:    sw zero, 248(sp)
+; RV32V-NEXT:    sw ra, 252(sp)
 ; RV32V-NEXT:    vsetvli ra, zero, e64, m1, ta, mu
 ; RV32V-NEXT:    vand.vi v13, v9, 2
 ; RV32V-NEXT:    vand.vi v14, v9, 1
 ; RV32V-NEXT:    vand.vi v12, v9, 4
 ; RV32V-NEXT:    vand.vi v11, v9, 8
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw a0, 236(sp)
+; RV32V-NEXT:    sw zero, 240(sp)
+; RV32V-NEXT:    sw a0, 244(sp)
 ; RV32V-NEXT:    vand.vx v0, v9, a0
-; RV32V-NEXT:    addi ra, sp, 272
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw s11, 228(sp)
+; RV32V-NEXT:    addi ra, sp, 16
+; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    sw s11, 236(sp)
 ; RV32V-NEXT:    vand.vx v15, v9, s11
-; RV32V-NEXT:    addi s11, sp, 264
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s8, 220(sp)
+; RV32V-NEXT:    addi s11, sp, 272
+; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw s8, 228(sp)
 ; RV32V-NEXT:    vand.vx v16, v9, s8
-; RV32V-NEXT:    addi s8, sp, 256
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s9, 212(sp)
+; RV32V-NEXT:    addi s8, sp, 264
+; RV32V-NEXT:    sw zero, 216(sp)
+; RV32V-NEXT:    sw s9, 220(sp)
 ; RV32V-NEXT:    vand.vx v17, v9, s9
-; RV32V-NEXT:    addi s9, sp, 248
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s10, 204(sp)
+; RV32V-NEXT:    addi s9, sp, 256
+; RV32V-NEXT:    sw zero, 208(sp)
+; RV32V-NEXT:    sw s10, 212(sp)
 ; RV32V-NEXT:    vand.vx v18, v9, s10
-; RV32V-NEXT:    addi s10, sp, 240
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s6, 196(sp)
+; RV32V-NEXT:    addi s10, sp, 248
+; RV32V-NEXT:    sw zero, 200(sp)
+; RV32V-NEXT:    sw s6, 204(sp)
 ; RV32V-NEXT:    vand.vx v19, v9, s6
-; RV32V-NEXT:    addi s6, sp, 232
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s1, 188(sp)
+; RV32V-NEXT:    addi s6, sp, 240
+; RV32V-NEXT:    sw zero, 192(sp)
+; RV32V-NEXT:    sw s1, 196(sp)
 ; RV32V-NEXT:    vand.vx v20, v9, s1
 ; RV32V-NEXT:    slli t5, t5, 11
 ; RV32V-NEXT:    vand.vx v21, v9, s7
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw t5, 188(sp)
 ; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw t5, 180(sp)
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw s7, 172(sp)
+; RV32V-NEXT:    sw s7, 180(sp)
 ; RV32V-NEXT:    vand.vx v22, v9, s5
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s5, 164(sp)
-; RV32V-NEXT:    addi s7, sp, 208
+; RV32V-NEXT:    sw zero, 168(sp)
+; RV32V-NEXT:    sw s5, 172(sp)
+; RV32V-NEXT:    addi s7, sp, 216
 ; RV32V-NEXT:    vand.vx v23, v9, s4
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw s4, 156(sp)
-; RV32V-NEXT:    addi s5, sp, 200
+; RV32V-NEXT:    sw zero, 160(sp)
+; RV32V-NEXT:    sw s4, 164(sp)
+; RV32V-NEXT:    addi s5, sp, 208
 ; RV32V-NEXT:    vand.vx v24, v9, s3
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw s3, 148(sp)
-; RV32V-NEXT:    addi s4, sp, 192
+; RV32V-NEXT:    sw zero, 152(sp)
+; RV32V-NEXT:    sw s3, 156(sp)
+; RV32V-NEXT:    addi s4, sp, 200
 ; RV32V-NEXT:    vand.vx v25, v9, s2
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw s2, 140(sp)
-; RV32V-NEXT:    addi s3, sp, 184
+; RV32V-NEXT:    sw zero, 144(sp)
+; RV32V-NEXT:    sw s2, 148(sp)
+; RV32V-NEXT:    addi s3, sp, 192
 ; RV32V-NEXT:    vand.vx v26, v9, s0
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw s0, 132(sp)
-; RV32V-NEXT:    addi s2, sp, 176
+; RV32V-NEXT:    sw zero, 136(sp)
+; RV32V-NEXT:    sw s0, 140(sp)
+; RV32V-NEXT:    addi s2, sp, 184
 ; RV32V-NEXT:    vand.vx v27, v9, t6
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t6, 124(sp)
-; RV32V-NEXT:    addi s1, sp, 168
+; RV32V-NEXT:    sw zero, 128(sp)
+; RV32V-NEXT:    sw t6, 132(sp)
+; RV32V-NEXT:    addi s1, sp, 176
 ; RV32V-NEXT:    vand.vx v28, v9, t4
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t4, 116(sp)
-; RV32V-NEXT:    addi s0, sp, 160
+; RV32V-NEXT:    sw zero, 120(sp)
+; RV32V-NEXT:    sw t4, 124(sp)
+; RV32V-NEXT:    addi s0, sp, 168
 ; RV32V-NEXT:    vand.vx v29, v9, t3
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t3, 108(sp)
-; RV32V-NEXT:    addi t6, sp, 152
+; RV32V-NEXT:    sw zero, 112(sp)
+; RV32V-NEXT:    sw t3, 116(sp)
+; RV32V-NEXT:    addi t6, sp, 160
 ; RV32V-NEXT:    vand.vx v30, v9, t2
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw t2, 100(sp)
-; RV32V-NEXT:    addi t4, sp, 144
+; RV32V-NEXT:    sw zero, 104(sp)
+; RV32V-NEXT:    sw t2, 108(sp)
+; RV32V-NEXT:    addi t4, sp, 152
 ; RV32V-NEXT:    vand.vx v31, v9, t1
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw t1, 92(sp)
-; RV32V-NEXT:    addi t3, sp, 136
+; RV32V-NEXT:    sw zero, 96(sp)
+; RV32V-NEXT:    sw t1, 100(sp)
+; RV32V-NEXT:    addi t3, sp, 144
 ; RV32V-NEXT:    vand.vx v7, v9, t0
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw t0, 84(sp)
-; RV32V-NEXT:    addi t2, sp, 128
+; RV32V-NEXT:    sw zero, 88(sp)
+; RV32V-NEXT:    sw t0, 92(sp)
+; RV32V-NEXT:    addi t2, sp, 136
 ; RV32V-NEXT:    vand.vx v6, v9, a7
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a7, 76(sp)
-; RV32V-NEXT:    addi t1, sp, 120
+; RV32V-NEXT:    sw zero, 80(sp)
+; RV32V-NEXT:    sw a7, 84(sp)
+; RV32V-NEXT:    addi t1, sp, 128
 ; RV32V-NEXT:    vand.vx v5, v9, a6
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a6, 68(sp)
-; RV32V-NEXT:    addi t0, sp, 112
+; RV32V-NEXT:    sw zero, 72(sp)
+; RV32V-NEXT:    sw a6, 76(sp)
+; RV32V-NEXT:    addi t0, sp, 120
 ; RV32V-NEXT:    vand.vx v4, v9, a5
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a5, 60(sp)
-; RV32V-NEXT:    addi a7, sp, 104
+; RV32V-NEXT:    sw zero, 64(sp)
+; RV32V-NEXT:    sw a5, 68(sp)
+; RV32V-NEXT:    addi a7, sp, 112
 ; RV32V-NEXT:    vand.vx v3, v9, a3
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a3, 52(sp)
-; RV32V-NEXT:    addi a6, sp, 96
+; RV32V-NEXT:    sw zero, 56(sp)
+; RV32V-NEXT:    sw a3, 60(sp)
+; RV32V-NEXT:    addi a6, sp, 104
 ; RV32V-NEXT:    vand.vx v2, v9, a4
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw a4, 44(sp)
-; RV32V-NEXT:    addi a5, sp, 88
+; RV32V-NEXT:    sw zero, 48(sp)
+; RV32V-NEXT:    sw a4, 52(sp)
+; RV32V-NEXT:    addi a5, sp, 96
 ; RV32V-NEXT:    vand.vx v1, v9, a2
+; RV32V-NEXT:    sw zero, 40(sp)
+; RV32V-NEXT:    sw a2, 44(sp)
+; RV32V-NEXT:    addi a4, sp, 88
 ; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw a2, 36(sp)
-; RV32V-NEXT:    addi a4, sp, 80
-; RV32V-NEXT:    sw zero, 24(sp)
 ; RV32V-NEXT:    lui a1, 262144
-; RV32V-NEXT:    sw a1, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
+; RV32V-NEXT:    sw a1, 36(sp)
+; RV32V-NEXT:    sw zero, 24(sp)
 ; RV32V-NEXT:    lui a0, 524288
-; RV32V-NEXT:    sw a0, 20(sp)
-; RV32V-NEXT:    addi a3, sp, 72
+; RV32V-NEXT:    sw a0, 28(sp)
+; RV32V-NEXT:    addi a3, sp, 80
 ; RV32V-NEXT:    vmul.vv v10, v8, v13
 ; RV32V-NEXT:    vmul.vv v13, v8, v14
-; RV32V-NEXT:    vxor.vi v13, v13, 0
 ; RV32V-NEXT:    vxor.vv v13, v13, v10
 ; RV32V-NEXT:    vand.vx v10, v9, t5
-; RV32V-NEXT:    addi a2, sp, 64
+; RV32V-NEXT:    addi a2, sp, 72
 ; RV32V-NEXT:    vmul.vv v12, v8, v12
 ; RV32V-NEXT:    vxor.vv v13, v13, v12
 ; RV32V-NEXT:    vlse64.v v12, (ra), zero
@@ -13371,7 +13362,7 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscale
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
 ; RV32V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 56
+; RV32V-NEXT:    addi a0, sp, 64
 ; RV32V-NEXT:    vmul.vv v11, v8, v11
 ; RV32V-NEXT:    vxor.vv v13, v13, v11
 ; RV32V-NEXT:    vlse64.v v11, (s11), zero
@@ -13380,15 +13371,15 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscale
 ; RV32V-NEXT:    add t5, sp, t5
 ; RV32V-NEXT:    addi t5, t5, 288
 ; RV32V-NEXT:    vs1r.v v11, (t5) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi s11, sp, 48
+; RV32V-NEXT:    addi s11, sp, 56
 ; RV32V-NEXT:    vmul.vv v14, v8, v0
 ; RV32V-NEXT:    vxor.vv v13, v13, v14
 ; RV32V-NEXT:    vlse64.v v0, (s8), zero
-; RV32V-NEXT:    addi ra, sp, 40
+; RV32V-NEXT:    addi ra, sp, 48
 ; RV32V-NEXT:    vmul.vv v14, v8, v15
 ; RV32V-NEXT:    vxor.vv v14, v13, v14
 ; RV32V-NEXT:    vlse64.v v12, (s9), zero
-; RV32V-NEXT:    addi t5, sp, 32
+; RV32V-NEXT:    addi t5, sp, 40
 ; RV32V-NEXT:    vmul.vv v15, v8, v16
 ; RV32V-NEXT:    vxor.vv v15, v14, v15
 ; RV32V-NEXT:    vlse64.v v11, (s10), zero
@@ -13398,7 +13389,7 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscale
 ; RV32V-NEXT:    add s8, sp, s8
 ; RV32V-NEXT:    addi s8, s8, 288
 ; RV32V-NEXT:    vs1r.v v11, (s8) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi s8, sp, 24
+; RV32V-NEXT:    addi s8, sp, 32
 ; RV32V-NEXT:    vmul.vv v16, v8, v17
 ; RV32V-NEXT:    vxor.vv v16, v15, v16
 ; RV32V-NEXT:    vlse64.v v11, (s6), zero
@@ -13407,7 +13398,7 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscale
 ; RV32V-NEXT:    add s6, sp, s6
 ; RV32V-NEXT:    addi s6, s6, 288
 ; RV32V-NEXT:    vs1r.v v11, (s6) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi s6, sp, 16
+; RV32V-NEXT:    addi s6, sp, 24
 ; RV32V-NEXT:    vmul.vv v17, v8, v18
 ; RV32V-NEXT:    vmul.vv v18, v8, v19
 ; RV32V-NEXT:    vmul.vv v19, v8, v20
@@ -13431,14 +13422,14 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscale
 ; RV32V-NEXT:    vmul.vv v2, v8, v1
 ; RV32V-NEXT:    vmul.vv v10, v8, v10
 ; RV32V-NEXT:    vxor.vv v16, v16, v17
-; RV32V-NEXT:    addi s9, sp, 224
+; RV32V-NEXT:    addi s9, sp, 232
 ; RV32V-NEXT:    vlse64.v v11, (s9), zero
 ; RV32V-NEXT:    csrr s9, vlenb
 ; RV32V-NEXT:    add s9, sp, s9
 ; RV32V-NEXT:    addi s9, s9, 288
 ; RV32V-NEXT:    vs1r.v v11, (s9) # vscale x 8-byte Folded Spill
 ; RV32V-NEXT:    vxor.vv v16, v16, v18
-; RV32V-NEXT:    addi s9, sp, 216
+; RV32V-NEXT:    addi s9, sp, 224
 ; RV32V-NEXT:    vlse64.v v11, (s9), zero
 ; RV32V-NEXT:    addi s9, sp, 288
 ; RV32V-NEXT:    vs1r.v v11, (s9) # vscale x 8-byte Folded Spill
@@ -14061,157 +14052,156 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vx_mask(<vscale x 1 x i64> %va, i64 %b,
 ; RV32V-NEXT:    vsetvli a1, zero, e64, m1, ta, mu
 ; RV32V-NEXT:    vlse64.v v9, (s3), zero
 ; RV32V-NEXT:    lui s3, 32768
-; RV32V-NEXT:    sw s2, 272(sp)
+; RV32V-NEXT:    sw s2, 16(sp)
 ; RV32V-NEXT:    lui a7, 524288
-; RV32V-NEXT:    sw zero, 276(sp)
+; RV32V-NEXT:    sw zero, 20(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw s11, 276(sp)
 ; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw s11, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw s5, 260(sp)
+; RV32V-NEXT:    sw s5, 268(sp)
 ; RV32V-NEXT:    lui s5, 65536
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw s7, 252(sp)
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw s7, 260(sp)
 ; RV32V-NEXT:    lui s7, 131072
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw s10, 244(sp)
+; RV32V-NEXT:    sw zero, 248(sp)
+; RV32V-NEXT:    sw s10, 252(sp)
 ; RV32V-NEXT:    lui a1, 262144
-; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    sw zero, 240(sp)
 ; RV32V-NEXT:    li s2, 16
-; RV32V-NEXT:    sw s2, 236(sp)
+; RV32V-NEXT:    sw s2, 244(sp)
 ; RV32V-NEXT:    li s10, 16
-; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw zero, 232(sp)
 ; RV32V-NEXT:    li s2, 32
-; RV32V-NEXT:    sw s2, 228(sp)
+; RV32V-NEXT:    sw s2, 236(sp)
+; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw ra, 228(sp)
 ; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw ra, 220(sp)
+; RV32V-NEXT:    sw s9, 220(sp)
 ; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s9, 212(sp)
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s8, 204(sp)
+; RV32V-NEXT:    sw s8, 212(sp)
 ; RV32V-NEXT:    li s2, 256
+; RV32V-NEXT:    sw zero, 200(sp)
+; RV32V-NEXT:    sw s6, 204(sp)
 ; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s6, 196(sp)
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s4, 188(sp)
+; RV32V-NEXT:    sw s4, 196(sp)
 ; RV32V-NEXT:    slli s11, s11, 11
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw s11, 188(sp)
 ; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw s11, 180(sp)
+; RV32V-NEXT:    sw s1, 180(sp)
 ; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw s1, 172(sp)
+; RV32V-NEXT:    sw s0, 172(sp)
 ; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s0, 164(sp)
+; RV32V-NEXT:    sw t6, 164(sp)
 ; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw t6, 156(sp)
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw t5, 148(sp)
+; RV32V-NEXT:    sw t5, 156(sp)
 ; RV32V-NEXT:    lui s1, 8
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw t4, 140(sp)
+; RV32V-NEXT:    sw zero, 144(sp)
+; RV32V-NEXT:    sw t4, 148(sp)
 ; RV32V-NEXT:    lui s0, 16
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw t3, 132(sp)
+; RV32V-NEXT:    sw zero, 136(sp)
+; RV32V-NEXT:    sw t3, 140(sp)
 ; RV32V-NEXT:    lui t5, 32
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t2, 124(sp)
+; RV32V-NEXT:    sw zero, 128(sp)
+; RV32V-NEXT:    sw t2, 132(sp)
 ; RV32V-NEXT:    lui t4, 64
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t1, 116(sp)
+; RV32V-NEXT:    sw zero, 120(sp)
+; RV32V-NEXT:    sw t1, 124(sp)
 ; RV32V-NEXT:    lui t3, 128
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t0, 108(sp)
+; RV32V-NEXT:    sw zero, 112(sp)
+; RV32V-NEXT:    sw t0, 116(sp)
 ; RV32V-NEXT:    lui t2, 256
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw a6, 100(sp)
+; RV32V-NEXT:    sw zero, 104(sp)
+; RV32V-NEXT:    sw a6, 108(sp)
 ; RV32V-NEXT:    lui t0, 512
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw a5, 92(sp)
+; RV32V-NEXT:    sw zero, 96(sp)
+; RV32V-NEXT:    sw a5, 100(sp)
 ; RV32V-NEXT:    lui a6, 1024
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw a4, 84(sp)
+; RV32V-NEXT:    sw zero, 88(sp)
+; RV32V-NEXT:    sw a4, 92(sp)
 ; RV32V-NEXT:    lui a5, 2048
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a3, 76(sp)
+; RV32V-NEXT:    sw zero, 80(sp)
+; RV32V-NEXT:    sw a3, 84(sp)
 ; RV32V-NEXT:    lui t1, 4096
+; RV32V-NEXT:    sw zero, 72(sp)
+; RV32V-NEXT:    sw a2, 76(sp)
 ; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a2, 68(sp)
+; RV32V-NEXT:    sw a0, 68(sp)
 ; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a0, 60(sp)
+; RV32V-NEXT:    sw s3, 60(sp)
 ; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw s3, 52(sp)
+; RV32V-NEXT:    sw s5, 52(sp)
 ; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw s5, 44(sp)
+; RV32V-NEXT:    sw s7, 44(sp)
 ; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw s7, 36(sp)
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    sw a1, 28(sp)
+; RV32V-NEXT:    sw a1, 36(sp)
 ; RV32V-NEXT:    lui a3, 262144
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw a7, 20(sp)
-; RV32V-NEXT:    addi a1, sp, 272
+; RV32V-NEXT:    sw zero, 24(sp)
+; RV32V-NEXT:    sw a7, 28(sp)
+; RV32V-NEXT:    addi a1, sp, 16
 ; RV32V-NEXT:    vlse64.v v12, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 264
+; RV32V-NEXT:    addi a1, sp, 272
 ; RV32V-NEXT:    vlse64.v v3, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 256
+; RV32V-NEXT:    addi a1, sp, 264
 ; RV32V-NEXT:    vlse64.v v10, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 248
+; RV32V-NEXT:    addi a1, sp, 256
 ; RV32V-NEXT:    vlse64.v v15, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 240
+; RV32V-NEXT:    addi a1, sp, 248
 ; RV32V-NEXT:    vlse64.v v16, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 232
+; RV32V-NEXT:    addi a1, sp, 240
 ; RV32V-NEXT:    vlse64.v v17, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 224
+; RV32V-NEXT:    addi a1, sp, 232
 ; RV32V-NEXT:    vlse64.v v18, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 216
+; RV32V-NEXT:    addi a1, sp, 224
 ; RV32V-NEXT:    vlse64.v v19, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 208
+; RV32V-NEXT:    addi a1, sp, 216
 ; RV32V-NEXT:    vlse64.v v20, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 200
+; RV32V-NEXT:    addi a1, sp, 208
 ; RV32V-NEXT:    vlse64.v v21, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 192
+; RV32V-NEXT:    addi a1, sp, 200
 ; RV32V-NEXT:    vlse64.v v22, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 184
+; RV32V-NEXT:    addi a1, sp, 192
 ; RV32V-NEXT:    vlse64.v v23, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 176
+; RV32V-NEXT:    addi a1, sp, 184
 ; RV32V-NEXT:    vlse64.v v24, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 168
+; RV32V-NEXT:    addi a1, sp, 176
 ; RV32V-NEXT:    vlse64.v v25, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 160
+; RV32V-NEXT:    addi a1, sp, 168
 ; RV32V-NEXT:    vlse64.v v26, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 152
+; RV32V-NEXT:    addi a1, sp, 160
 ; RV32V-NEXT:    vlse64.v v27, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 144
+; RV32V-NEXT:    addi a1, sp, 152
 ; RV32V-NEXT:    vlse64.v v28, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 136
+; RV32V-NEXT:    addi a1, sp, 144
 ; RV32V-NEXT:    vlse64.v v29, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 128
+; RV32V-NEXT:    addi a1, sp, 136
 ; RV32V-NEXT:    vlse64.v v30, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 120
+; RV32V-NEXT:    addi a1, sp, 128
 ; RV32V-NEXT:    vlse64.v v31, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 112
+; RV32V-NEXT:    addi a1, sp, 120
 ; RV32V-NEXT:    vlse64.v v7, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 104
+; RV32V-NEXT:    addi a1, sp, 112
 ; RV32V-NEXT:    vlse64.v v6, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 96
+; RV32V-NEXT:    addi a1, sp, 104
 ; RV32V-NEXT:    vlse64.v v5, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 88
+; RV32V-NEXT:    addi a1, sp, 96
 ; RV32V-NEXT:    vlse64.v v4, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 80
+; RV32V-NEXT:    addi a1, sp, 88
 ; RV32V-NEXT:    vlse64.v v0, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 72
+; RV32V-NEXT:    addi a1, sp, 80
 ; RV32V-NEXT:    vlse64.v v11, (a1), zero
-; RV32V-NEXT:    addi a1, sp, 64
+; RV32V-NEXT:    addi a1, sp, 72
 ; RV32V-NEXT:    vlse64.v v13, (a1), zero
 ; RV32V-NEXT:    csrr a1, vlenb
 ; RV32V-NEXT:    slli a1, a1, 2
 ; RV32V-NEXT:    add a1, sp, a1
 ; RV32V-NEXT:    addi a1, a1, 288
 ; RV32V-NEXT:    vs1r.v v13, (a1) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi ra, sp, 56
+; RV32V-NEXT:    addi ra, sp, 64
 ; RV32V-NEXT:    vand.vi v2, v9, 2
 ; RV32V-NEXT:    vand.vi v1, v9, 1
 ; RV32V-NEXT:    vmul.vv v2, v8, v2
 ; RV32V-NEXT:    vmul.vv v1, v8, v1
-; RV32V-NEXT:    vxor.vi v1, v1, 0
 ; RV32V-NEXT:    vxor.vv v2, v1, v2
 ; RV32V-NEXT:    vand.vi v1, v9, 4
 ; RV32V-NEXT:    vmul.vv v1, v8, v1
@@ -14220,26 +14210,26 @@ define <vscale x 1 x i64> @clmul_nxv1i64_vx_mask(<vscale x 1 x i64> %va, i64 %b,
 ; RV32V-NEXT:    vmul.vv v1, v8, v1
 ; RV32V-NEXT:    vxor.vv v2, v2, v1
 ; RV32V-NEXT:    vand.vx v1, v9, s10
-; RV32V-NEXT:    addi s10, sp, 48
+; RV32V-NEXT:    addi s10, sp, 56
 ; RV32V-NEXT:    vmul.vv v1, v8, v1
 ; RV32V-NEXT:    vxor.vv v2, v2, v1
 ; RV32V-NEXT:    li a1, 32
 ; RV32V-NEXT:    vand.vx v1, v9, a1
-; RV32V-NEXT:    addi s9, sp, 40
+; RV32V-NEXT:    addi s9, sp, 48
 ; RV32V-NEXT:    vmul.vv v1, v8, v1
 ; RV32V-NEXT:    vxor.vv v2, v2, v1
 ; RV32V-NEXT:    li a1, 64
 ; RV32V-NEXT:    vand.vx v1, v9, a1
-; RV32V-NEXT:    addi s8, sp, 32
+; RV32V-NEXT:    addi s8, sp, 40
 ; RV32V-NEXT:    vmul.vv v1, v8, v1
 ; RV32V-NEXT:    vxor.vv v2, v2, v1
 ; RV32V-NEXT:    li a1, 128
 ; RV32V-NEXT:    vand.vx v1, v9, a1
-; RV32V-NEXT:    addi s6, sp, 24
+; RV32V-NEXT:    addi s6, sp, 32
 ; RV32V-NEXT:    vmul.vv v1, v8, v1
 ; RV32V-NEXT:    vxor.vv v2, v2, v1
 ; RV32V-NEXT:    vand.vx v1, v9, s2
-; RV32V-NEXT:    addi s4, sp, 16
+; RV32V-NEXT:    addi s4, sp, 24
 ; RV32V-NEXT:    vmul.vv v1, v8, v1
 ; RV32V-NEXT:    vxor.vv v2, v2, v1
 ; RV32V-NEXT:    li a1, 512
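
For anyone skimming the regenerated checks below: the generic expansion that now fires for these nodes builds the carry-less product one set bit at a time. Each vand.vi/vand.vx isolates a single bit of one operand, the vmul.vv by that power-of-two mask acts as a shift of the other operand, and the vxor.vv folds the partial product into the accumulator (note the first partial product no longer gets a redundant vxor.vi with zero). A scalar C++ sketch of the same idea, purely illustrative and not the in-tree code:

  #include <cstdint>

  // Carry-less multiply, low 64 bits: mirrors the vand/vmul/vxor ladder
  // in the vector output below.
  uint64_t clmul64(uint64_t a, uint64_t b) {
    uint64_t acc = 0;
    for (unsigned i = 0; i < 64; ++i) {
      uint64_t bit = b & (uint64_t{1} << i); // vand.vx with a power-of-two mask
      acc ^= a * bit;                        // multiply by 0 or 2^i is a shift; xor accumulates
    }
    return acc;
  }

This is also why the new RV32V output materializes each mask with li/lui feeding vand.vx directly, instead of the old pattern of spilling every 64-bit mask to the stack and re-splatting it with vlse64.v.
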
diff --git a/llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll
index 3b464ba034033..2bb0603838fd6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll
@@ -3065,501 +3065,141 @@ define <vscale x 32 x i16> @clmulh_nxv32i16_vx(<vscale x 32 x i16> %va, i16 %b)
 define <vscale x 1 x i32> @clmulh_nxv1i32_vv(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb) nounwind {
 ; RV32V-LABEL: clmulh_nxv1i32_vv:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -352
-; RV32V-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    sub sp, sp, a0
-; RV32V-NEXT:    lui a2, 524288
-; RV32V-NEXT:    li t6, 1
-; RV32V-NEXT:    li a5, 2
-; RV32V-NEXT:    li a3, 4
-; RV32V-NEXT:    li s10, 8
-; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    li a1, 32
-; RV32V-NEXT:    li s11, 64
-; RV32V-NEXT:    li ra, 128
-; RV32V-NEXT:    li s9, 256
-; RV32V-NEXT:    li s8, 512
-; RV32V-NEXT:    li s7, 1024
-; RV32V-NEXT:    lui s6, 1
-; RV32V-NEXT:    lui s5, 2
-; RV32V-NEXT:    lui s4, 4
-; RV32V-NEXT:    lui s3, 8
-; RV32V-NEXT:    lui s2, 16
-; RV32V-NEXT:    lui s1, 32
-; RV32V-NEXT:    lui s0, 64
-; RV32V-NEXT:    lui t5, 128
-; RV32V-NEXT:    lui t4, 256
-; RV32V-NEXT:    lui t3, 512
-; RV32V-NEXT:    lui t2, 1024
-; RV32V-NEXT:    lui t1, 2048
-; RV32V-NEXT:    lui t0, 4096
-; RV32V-NEXT:    lui a7, 8192
-; RV32V-NEXT:    lui a6, 16384
-; RV32V-NEXT:    lui a4, 32768
-; RV32V-NEXT:    sw a2, 272(sp)
-; RV32V-NEXT:    sw zero, 276(sp)
-; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw t6, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw a5, 260(sp)
-; RV32V-NEXT:    lui a5, 65536
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw a3, 252(sp)
-; RV32V-NEXT:    lui a3, 131072
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw s10, 244(sp)
-; RV32V-NEXT:    vsetvli s10, zero, e64, m1, ta, ma
+; RV32V-NEXT:    addi sp, sp, -16
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32V-NEXT:    vzext.vf2 v10, v8
 ; RV32V-NEXT:    vzext.vf2 v8, v9
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw a0, 236(sp)
-; RV32V-NEXT:    vand.vx v9, v8, a0
-; RV32V-NEXT:    addi s10, sp, 272
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw a1, 228(sp)
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s11, 220(sp)
-; RV32V-NEXT:    vand.vx v11, v8, s11
-; RV32V-NEXT:    addi s11, sp, 264
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw ra, 212(sp)
-; RV32V-NEXT:    vand.vx v12, v8, ra
-; RV32V-NEXT:    addi ra, sp, 256
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s9, 204(sp)
-; RV32V-NEXT:    vand.vx v13, v8, s9
-; RV32V-NEXT:    addi s9, sp, 248
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s8, 196(sp)
-; RV32V-NEXT:    vand.vx v14, v8, s8
-; RV32V-NEXT:    addi s8, sp, 240
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s7, 188(sp)
-; RV32V-NEXT:    vand.vx v15, v8, s7
-; RV32V-NEXT:    addi s7, sp, 232
-; RV32V-NEXT:    slli t6, t6, 11
-; RV32V-NEXT:    vand.vx v16, v8, s6
-; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw t6, 180(sp)
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw s6, 172(sp)
-; RV32V-NEXT:    addi s6, sp, 224
-; RV32V-NEXT:    vand.vx v17, v8, s5
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s5, 164(sp)
-; RV32V-NEXT:    addi s5, sp, 216
-; RV32V-NEXT:    vand.vx v18, v8, s4
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw s4, 156(sp)
-; RV32V-NEXT:    addi s4, sp, 208
-; RV32V-NEXT:    vand.vx v19, v8, s3
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw s3, 148(sp)
-; RV32V-NEXT:    addi s3, sp, 200
-; RV32V-NEXT:    vand.vx v20, v8, s2
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw s2, 140(sp)
-; RV32V-NEXT:    addi s2, sp, 192
-; RV32V-NEXT:    vand.vx v21, v8, s1
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw s1, 132(sp)
-; RV32V-NEXT:    addi s1, sp, 184
-; RV32V-NEXT:    vand.vx v22, v8, s0
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw s0, 124(sp)
-; RV32V-NEXT:    addi s0, sp, 176
-; RV32V-NEXT:    vand.vx v23, v8, t5
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t5, 116(sp)
-; RV32V-NEXT:    addi t5, sp, 168
-; RV32V-NEXT:    vand.vx v24, v8, t4
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t4, 108(sp)
-; RV32V-NEXT:    addi t4, sp, 160
-; RV32V-NEXT:    vand.vx v25, v8, t3
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw t3, 100(sp)
-; RV32V-NEXT:    addi t3, sp, 152
-; RV32V-NEXT:    vand.vx v26, v8, t2
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw t2, 92(sp)
-; RV32V-NEXT:    addi t2, sp, 144
-; RV32V-NEXT:    vand.vx v27, v8, t1
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw t1, 84(sp)
-; RV32V-NEXT:    addi t1, sp, 136
-; RV32V-NEXT:    vand.vx v28, v8, t0
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw t0, 76(sp)
-; RV32V-NEXT:    addi t0, sp, 128
-; RV32V-NEXT:    vand.vx v29, v8, a7
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a7, 68(sp)
-; RV32V-NEXT:    addi a7, sp, 120
-; RV32V-NEXT:    vand.vx v31, v8, a6
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a6, 60(sp)
-; RV32V-NEXT:    addi a6, sp, 112
-; RV32V-NEXT:    vand.vx v6, v8, a4
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a4, 52(sp)
-; RV32V-NEXT:    addi a4, sp, 104
-; RV32V-NEXT:    vand.vx v4, v8, a5
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw a5, 44(sp)
-; RV32V-NEXT:    addi a5, sp, 96
-; RV32V-NEXT:    vand.vx v3, v8, a3
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw a3, 36(sp)
-; RV32V-NEXT:    addi a3, sp, 88
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    sw a0, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw a2, 20(sp)
-; RV32V-NEXT:    addi a1, sp, 80
-; RV32V-NEXT:    vand.vx v1, v8, t6
-; RV32V-NEXT:    addi t6, sp, 72
-; RV32V-NEXT:    vlse64.v v30, (s10), zero
-; RV32V-NEXT:    addi s10, sp, 64
-; RV32V-NEXT:    vlse64.v v7, (s11), zero
-; RV32V-NEXT:    addi s11, sp, 56
-; RV32V-NEXT:    vlse64.v v5, (ra), zero
-; RV32V-NEXT:    addi ra, sp, 48
-; RV32V-NEXT:    vand.vi v2, v8, 2
-; RV32V-NEXT:    vand.vi v0, v8, 1
-; RV32V-NEXT:    vmul.vv v2, v10, v2
-; RV32V-NEXT:    vmul.vv v0, v10, v0
-; RV32V-NEXT:    vxor.vi v0, v0, 0
-; RV32V-NEXT:    vxor.vv v2, v0, v2
-; RV32V-NEXT:    vand.vi v0, v8, 4
-; RV32V-NEXT:    vmul.vv v0, v10, v0
-; RV32V-NEXT:    vxor.vv v2, v2, v0
-; RV32V-NEXT:    vand.vi v0, v8, 8
-; RV32V-NEXT:    vmul.vv v0, v10, v0
-; RV32V-NEXT:    vxor.vv v0, v2, v0
-; RV32V-NEXT:    vlse64.v v2, (s9), zero
-; RV32V-NEXT:    addi a2, sp, 40
+; RV32V-NEXT:    li a2, 16
+; RV32V-NEXT:    li a1, 32
+; RV32V-NEXT:    vand.vx v9, v8, a2
+; RV32V-NEXT:    li a2, 64
+; RV32V-NEXT:    vand.vi v11, v8, 2
+; RV32V-NEXT:    vand.vi v12, v8, 1
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v11, v12, v11
+; RV32V-NEXT:    vand.vi v12, v8, 4
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v11, v11, v12
+; RV32V-NEXT:    vand.vi v12, v8, 8
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v11, v11, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    li a2, 128
 ; RV32V-NEXT:    vmul.vv v9, v10, v9
-; RV32V-NEXT:    vxor.vv v9, v0, v9
-; RV32V-NEXT:    li s9, 32
-; RV32V-NEXT:    vand.vx v0, v8, s9
-; RV32V-NEXT:    li s9, 32
-; RV32V-NEXT:    vmul.vv v0, v10, v0
-; RV32V-NEXT:    vxor.vv v0, v9, v0
-; RV32V-NEXT:    vlse64.v v9, (s8), zero
-; RV32V-NEXT:    csrr s8, vlenb
-; RV32V-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    slli a0, s8, 2
-; RV32V-NEXT:    add s8, a0, s8
-; RV32V-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    add s8, sp, s8
-; RV32V-NEXT:    addi s8, s8, 288
-; RV32V-NEXT:    vs1r.v v9, (s8) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi s8, sp, 32
+; RV32V-NEXT:    vxor.vv v9, v11, v9
+; RV32V-NEXT:    vand.vx v11, v8, a1
 ; RV32V-NEXT:    vmul.vv v11, v10, v11
-; RV32V-NEXT:    vxor.vv v0, v0, v11
-; RV32V-NEXT:    vlse64.v v9, (s7), zero
-; RV32V-NEXT:    addi s7, sp, 24
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    li a2, 256
 ; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v0, v0, v12
-; RV32V-NEXT:    vlse64.v v11, (s6), zero
-; RV32V-NEXT:    csrr s6, vlenb
-; RV32V-NEXT:    slli s6, s6, 2
-; RV32V-NEXT:    add s6, sp, s6
-; RV32V-NEXT:    addi s6, s6, 288
-; RV32V-NEXT:    vs1r.v v11, (s6) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi s6, sp, 16
-; RV32V-NEXT:    vmul.vv v13, v10, v13
-; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vmul.vv v15, v10, v15
-; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vmul.vv v17, v10, v17
-; RV32V-NEXT:    vmul.vv v18, v10, v18
-; RV32V-NEXT:    vmul.vv v19, v10, v19
-; RV32V-NEXT:    vmul.vv v20, v10, v20
-; RV32V-NEXT:    vmul.vv v21, v10, v21
-; RV32V-NEXT:    vmul.vv v22, v10, v22
-; RV32V-NEXT:    vmul.vv v23, v10, v23
-; RV32V-NEXT:    vmul.vv v24, v10, v24
-; RV32V-NEXT:    vmul.vv v25, v10, v25
-; RV32V-NEXT:    vmul.vv v26, v10, v26
-; RV32V-NEXT:    vmul.vv v27, v10, v27
-; RV32V-NEXT:    vmul.vv v28, v10, v28
-; RV32V-NEXT:    vmul.vv v29, v10, v29
-; RV32V-NEXT:    vmul.vv v31, v10, v31
-; RV32V-NEXT:    vmul.vv v6, v10, v6
-; RV32V-NEXT:    vmul.vv v4, v10, v4
-; RV32V-NEXT:    vmul.vv v3, v10, v3
-; RV32V-NEXT:    vmul.vv v1, v10, v1
-; RV32V-NEXT:    vxor.vv v13, v0, v13
-; RV32V-NEXT:    vlse64.v v0, (s5), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v14
-; RV32V-NEXT:    vlse64.v v11, (s4), zero
-; RV32V-NEXT:    csrr s4, vlenb
-; RV32V-NEXT:    slli s5, s4, 1
-; RV32V-NEXT:    add s4, s5, s4
-; RV32V-NEXT:    add s4, sp, s4
-; RV32V-NEXT:    addi s4, s4, 288
-; RV32V-NEXT:    vs1r.v v11, (s4) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vxor.vv v13, v13, v15
-; RV32V-NEXT:    vlse64.v v11, (s3), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v1
-; RV32V-NEXT:    vlse64.v v12, (s2), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v16
-; RV32V-NEXT:    vlse64.v v16, (s1), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v17
-; RV32V-NEXT:    vlse64.v v17, (s0), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v18
-; RV32V-NEXT:    vlse64.v v18, (t5), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v19
-; RV32V-NEXT:    vlse64.v v19, (t4), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v20
-; RV32V-NEXT:    vlse64.v v20, (t3), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v21
-; RV32V-NEXT:    vlse64.v v21, (t2), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v22
-; RV32V-NEXT:    vlse64.v v22, (t1), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v23
-; RV32V-NEXT:    vlse64.v v23, (t0), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v24
-; RV32V-NEXT:    vlse64.v v24, (a7), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v25
-; RV32V-NEXT:    vlse64.v v25, (a6), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v26
-; RV32V-NEXT:    vlse64.v v26, (a4), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v27
-; RV32V-NEXT:    vlse64.v v27, (a5), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v28
-; RV32V-NEXT:    vlse64.v v28, (a3), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v29
-; RV32V-NEXT:    vlse64.v v29, (a1), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v31
-; RV32V-NEXT:    vlse64.v v31, (t6), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v6
-; RV32V-NEXT:    vlse64.v v6, (s10), zero
-; RV32V-NEXT:    vxor.vv v13, v13, v4
-; RV32V-NEXT:    vlse64.v v4, (s11), zero
-; RV32V-NEXT:    vxor.vv v3, v13, v3
-; RV32V-NEXT:    vlse64.v v13, (ra), zero
-; RV32V-NEXT:    vand.vv v30, v8, v30
-; RV32V-NEXT:    vand.vv v7, v8, v7
-; RV32V-NEXT:    vand.vv v5, v8, v5
-; RV32V-NEXT:    vand.vv v2, v8, v2
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a3, a1, 2
-; RV32V-NEXT:    add a1, a3, a1
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vl1r.v v14, (a1) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v14, v8, v14
-; RV32V-NEXT:    vand.vv v15, v8, v9
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vl1r.v v9, (a1) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v1, v8, v9
-; RV32V-NEXT:    vand.vv v0, v8, v0
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a3, a1, 1
-; RV32V-NEXT:    add a1, a3, a1
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vl1r.v v9, (a1) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v9, v8, v9
-; RV32V-NEXT:    addi a1, sp, 288
-; RV32V-NEXT:    vs1r.v v9, (a1) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v11
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs1r.v v9, (a1) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v12
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs1r.v v9, (a1) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v16, v8, v16
-; RV32V-NEXT:    vand.vv v17, v8, v17
-; RV32V-NEXT:    vand.vv v18, v8, v18
-; RV32V-NEXT:    vand.vv v19, v8, v19
-; RV32V-NEXT:    vand.vv v20, v8, v20
-; RV32V-NEXT:    vand.vv v21, v8, v21
-; RV32V-NEXT:    vand.vv v22, v8, v22
-; RV32V-NEXT:    vand.vv v23, v8, v23
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    vand.vv v25, v8, v25
-; RV32V-NEXT:    vand.vv v26, v8, v26
-; RV32V-NEXT:    vand.vv v27, v8, v27
-; RV32V-NEXT:    vand.vv v28, v8, v28
-; RV32V-NEXT:    vand.vv v29, v8, v29
-; RV32V-NEXT:    vand.vv v31, v8, v31
-; RV32V-NEXT:    vand.vv v9, v8, v6
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a3, a1, 1
-; RV32V-NEXT:    add a1, a3, a1
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs1r.v v9, (a1) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v4, v8, v4
-; RV32V-NEXT:    vand.vv v9, v8, v13
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a3, a1, 2
-; RV32V-NEXT:    add a1, a3, a1
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs1r.v v9, (a1) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v13, (a2), zero
-; RV32V-NEXT:    vlse64.v v9, (s8), zero
-; RV32V-NEXT:    vlse64.v v11, (s7), zero
-; RV32V-NEXT:    vlse64.v v12, (s6), zero
-; RV32V-NEXT:    vand.vv v6, v8, v13
-; RV32V-NEXT:    vand.vv v13, v8, v9
-; RV32V-NEXT:    vand.vv v9, v8, v11
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs1r.v v9, (a1) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v11, v8, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    li a2, 512
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    li a2, 1024
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 1
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    li a2, 1
+; RV32V-NEXT:    slli a2, a2, 11
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 2
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 4
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    lui a2, 8
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 16
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    lui a2, 32
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 64
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    lui a2, 128
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 256
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    lui a2, 512
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 1024
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    lui a2, 2048
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 4096
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    lui a2, 8192
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 16384
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    lui a2, 32768
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 65536
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    lui a2, 131072
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    addi a2, sp, 8
+; RV32V-NEXT:    sw a0, 8(sp)
+; RV32V-NEXT:    sw zero, 12(sp)
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vlse64.v v11, (a2), zero
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vxor.vv v9, v9, v12
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    vand.vv v11, v8, v11
 ; RV32V-NEXT:    vand.vx v8, v8, a0
 ; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    vxor.vv v8, v3, v8
-; RV32V-NEXT:    vmul.vv v9, v10, v30
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v7
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v5
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v2
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v15
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v1
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v9, v10, v9
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v9, v10, v9
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v9, v10, v9
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v17
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v18
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v19
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v21
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v22
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v23
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v25
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v26
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v27
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v28
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v29
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v31
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 1
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v9, v10, v9
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v4
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 2
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v9, v10, v9
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v6
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vmul.vv v9, v10, v13
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v9, v10, v9
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v9, v8
 ; RV32V-NEXT:    vmul.vv v9, v10, v11
 ; RV32V-NEXT:    vxor.vv v8, v8, v9
 ; RV32V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32V-NEXT:    vnsrl.wx v8, v8, s9
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add sp, sp, a0
-; RV32V-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    addi sp, sp, 352
+; RV32V-NEXT:    vnsrl.wx v8, v8, a1
+; RV32V-NEXT:    addi sp, sp, 16
 ; RV32V-NEXT:    ret
 ;
 ; RV64V-LABEL: clmulh_nxv1i32_vv:
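
One mask in the RV32V output above still goes through memory: 1 << 31. On RV32 the scalar operand of vand.vx is sign-extended from XLEN to SEW=64, so an a0 holding 0x80000000 would splat as 0xFFFFFFFF80000000; the lowering therefore stores the pair {0x80000000, 0} into the 16-byte stack slot and splats it with a zero-strided vlse64.v, while 0x40000000 (lui a0, 262144) sign-extends cleanly and can stay in vand.vx. A minimal sketch of the hazard (mine, not from the patch):

  #include <cstdint>

  int32_t rs1 = (int32_t)0x80000000u; // lui a0, 524288 on RV32
  int64_t splat = rs1;                // sign-extends to 0xFFFFFFFF80000000
  // The mask wanted per lane is 0x0000000080000000, hence the sw/sw pair
  // plus the zero-strided vlse64.v splat instead of vand.vx.
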
@@ -3727,488 +3367,143 @@ define <vscale x 1 x i32> @clmulh_nxv1i32_vv(<vscale x 1 x i32> %va, <vscale x 1
 define <vscale x 1 x i32> @clmulh_nxv1i32_vx(<vscale x 1 x i32> %va, i32 %b) nounwind {
 ; RV32V-LABEL: clmulh_nxv1i32_vx:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -352
-; RV32V-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    sub sp, sp, a1
+; RV32V-NEXT:    addi sp, sp, -16
 ; RV32V-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; RV32V-NEXT:    vmv.v.x v10, a0
-; RV32V-NEXT:    lui ra, 524288
-; RV32V-NEXT:    li s11, 1
-; RV32V-NEXT:    li a5, 2
-; RV32V-NEXT:    li a3, 4
-; RV32V-NEXT:    li a0, 8
-; RV32V-NEXT:    li s9, 16
-; RV32V-NEXT:    li a1, 32
-; RV32V-NEXT:    li s10, 64
-; RV32V-NEXT:    li s8, 128
-; RV32V-NEXT:    li s7, 256
-; RV32V-NEXT:    li s6, 512
-; RV32V-NEXT:    li s5, 1024
-; RV32V-NEXT:    lui s4, 1
-; RV32V-NEXT:    lui s3, 2
-; RV32V-NEXT:    lui s2, 4
-; RV32V-NEXT:    lui s1, 8
-; RV32V-NEXT:    lui s0, 16
-; RV32V-NEXT:    lui t6, 32
-; RV32V-NEXT:    lui t5, 64
-; RV32V-NEXT:    lui t4, 128
-; RV32V-NEXT:    lui t3, 256
-; RV32V-NEXT:    lui t2, 512
-; RV32V-NEXT:    lui t1, 1024
-; RV32V-NEXT:    lui t0, 2048
-; RV32V-NEXT:    lui a7, 4096
-; RV32V-NEXT:    lui a6, 8192
-; RV32V-NEXT:    lui a4, 16384
-; RV32V-NEXT:    lui a2, 32768
-; RV32V-NEXT:    sw ra, 272(sp)
-; RV32V-NEXT:    sw zero, 276(sp)
-; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw s11, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw a5, 260(sp)
-; RV32V-NEXT:    lui a5, 65536
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw a3, 252(sp)
-; RV32V-NEXT:    lui a3, 131072
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw a0, 244(sp)
-; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    lui a0, 524288
 ; RV32V-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; RV32V-NEXT:    vzext.vf2 v9, v8
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw s9, 236(sp)
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw a1, 228(sp)
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s10, 220(sp)
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s8, 212(sp)
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s7, 204(sp)
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s6, 196(sp)
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s5, 188(sp)
-; RV32V-NEXT:    slli s11, s11, 11
-; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw s11, 180(sp)
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw s4, 172(sp)
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s3, 164(sp)
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw s2, 156(sp)
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw s1, 148(sp)
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw s0, 140(sp)
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw t6, 132(sp)
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t5, 124(sp)
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t4, 116(sp)
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t3, 108(sp)
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw t2, 100(sp)
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw t1, 92(sp)
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw t0, 84(sp)
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a7, 76(sp)
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a6, 68(sp)
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a4, 60(sp)
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a2, 52(sp)
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw a5, 44(sp)
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw a3, 36(sp)
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    sw a0, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw ra, 20(sp)
-; RV32V-NEXT:    addi ra, sp, 272
+; RV32V-NEXT:    li a2, 16
+; RV32V-NEXT:    li a1, 32
 ; RV32V-NEXT:    vzext.vf2 v8, v10
-; RV32V-NEXT:    vand.vx v14, v8, s9
-; RV32V-NEXT:    addi a0, sp, 264
-; RV32V-NEXT:    vand.vx v12, v8, s10
-; RV32V-NEXT:    addi s10, sp, 256
-; RV32V-NEXT:    vand.vx v11, v8, s8
-; RV32V-NEXT:    addi s9, sp, 248
-; RV32V-NEXT:    vand.vx v10, v8, s7
-; RV32V-NEXT:    addi s8, sp, 240
-; RV32V-NEXT:    vand.vx v13, v8, s6
-; RV32V-NEXT:    addi s7, sp, 232
-; RV32V-NEXT:    vand.vx v15, v8, s5
-; RV32V-NEXT:    addi s6, sp, 224
-; RV32V-NEXT:    vand.vx v16, v8, s11
-; RV32V-NEXT:    addi s5, sp, 216
-; RV32V-NEXT:    vand.vx v17, v8, s4
-; RV32V-NEXT:    addi s4, sp, 208
-; RV32V-NEXT:    vand.vx v18, v8, s3
-; RV32V-NEXT:    addi s3, sp, 200
-; RV32V-NEXT:    vand.vx v19, v8, s2
-; RV32V-NEXT:    addi s2, sp, 192
-; RV32V-NEXT:    vand.vx v20, v8, s1
-; RV32V-NEXT:    addi s1, sp, 184
-; RV32V-NEXT:    vand.vx v21, v8, s0
-; RV32V-NEXT:    addi s0, sp, 176
-; RV32V-NEXT:    vand.vx v22, v8, t6
-; RV32V-NEXT:    addi t6, sp, 168
-; RV32V-NEXT:    vand.vx v23, v8, t5
-; RV32V-NEXT:    addi t5, sp, 160
-; RV32V-NEXT:    vand.vx v24, v8, t4
-; RV32V-NEXT:    addi t4, sp, 152
-; RV32V-NEXT:    vand.vx v25, v8, t3
-; RV32V-NEXT:    addi t3, sp, 144
-; RV32V-NEXT:    vand.vx v26, v8, t2
-; RV32V-NEXT:    addi t2, sp, 136
-; RV32V-NEXT:    vand.vx v28, v8, t1
-; RV32V-NEXT:    addi t1, sp, 128
-; RV32V-NEXT:    vand.vx v30, v8, t0
-; RV32V-NEXT:    addi t0, sp, 120
-; RV32V-NEXT:    vand.vx v7, v8, a7
-; RV32V-NEXT:    addi a7, sp, 112
-; RV32V-NEXT:    vand.vx v6, v8, a6
-; RV32V-NEXT:    addi a6, sp, 104
-; RV32V-NEXT:    vand.vx v5, v8, a4
-; RV32V-NEXT:    addi a4, sp, 96
-; RV32V-NEXT:    vand.vx v3, v8, a2
-; RV32V-NEXT:    addi a2, sp, 88
-; RV32V-NEXT:    vand.vx v2, v8, a5
-; RV32V-NEXT:    addi a5, sp, 80
-; RV32V-NEXT:    vand.vx v1, v8, a3
-; RV32V-NEXT:    addi a3, sp, 72
-; RV32V-NEXT:    vlse64.v v27, (ra), zero
-; RV32V-NEXT:    addi s11, sp, 64
-; RV32V-NEXT:    vlse64.v v29, (a0), zero
-; RV32V-NEXT:    addi ra, sp, 56
-; RV32V-NEXT:    vlse64.v v31, (s10), zero
-; RV32V-NEXT:    addi s10, sp, 48
-; RV32V-NEXT:    vand.vi v4, v8, 2
-; RV32V-NEXT:    vand.vi v0, v8, 1
-; RV32V-NEXT:    vmul.vv v4, v9, v4
-; RV32V-NEXT:    vmul.vv v0, v9, v0
-; RV32V-NEXT:    vxor.vi v0, v0, 0
-; RV32V-NEXT:    vxor.vv v4, v0, v4
-; RV32V-NEXT:    vand.vi v0, v8, 4
-; RV32V-NEXT:    vmul.vv v0, v9, v0
-; RV32V-NEXT:    vxor.vv v4, v4, v0
-; RV32V-NEXT:    vand.vi v0, v8, 8
-; RV32V-NEXT:    vmul.vv v0, v9, v0
-; RV32V-NEXT:    vxor.vv v0, v4, v0
-; RV32V-NEXT:    vlse64.v v4, (s9), zero
-; RV32V-NEXT:    addi a0, sp, 40
-; RV32V-NEXT:    vmul.vv v14, v9, v14
-; RV32V-NEXT:    vxor.vv v14, v0, v14
-; RV32V-NEXT:    vand.vx v0, v8, a1
-; RV32V-NEXT:    vmul.vv v0, v9, v0
-; RV32V-NEXT:    vxor.vv v0, v14, v0
-; RV32V-NEXT:    vlse64.v v14, (s8), zero
-; RV32V-NEXT:    addi s8, sp, 32
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vxor.vv v0, v0, v12
-; RV32V-NEXT:    vlse64.v v12, (s7), zero
-; RV32V-NEXT:    csrr s7, vlenb
-; RV32V-NEXT:    slli s9, s7, 2
-; RV32V-NEXT:    add s7, s9, s7
-; RV32V-NEXT:    add s7, sp, s7
-; RV32V-NEXT:    addi s7, s7, 288
-; RV32V-NEXT:    vs1r.v v12, (s7) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi s7, sp, 24
+; RV32V-NEXT:    vand.vx v10, v8, a2
+; RV32V-NEXT:    li a2, 64
+; RV32V-NEXT:    vand.vi v11, v8, 2
+; RV32V-NEXT:    vand.vi v12, v8, 1
 ; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v0, v0, v11
-; RV32V-NEXT:    vlse64.v v12, (s6), zero
-; RV32V-NEXT:    addi s6, sp, 16
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v11, v12, v11
+; RV32V-NEXT:    vand.vi v12, v8, 4
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v11, v11, v12
+; RV32V-NEXT:    vand.vi v12, v8, 8
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v11, v11, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    li a2, 128
 ; RV32V-NEXT:    vmul.vv v10, v9, v10
-; RV32V-NEXT:    vmul.vv v13, v9, v13
-; RV32V-NEXT:    vmul.vv v15, v9, v15
-; RV32V-NEXT:    vmul.vv v16, v9, v16
-; RV32V-NEXT:    vmul.vv v17, v9, v17
-; RV32V-NEXT:    vmul.vv v18, v9, v18
-; RV32V-NEXT:    vmul.vv v19, v9, v19
-; RV32V-NEXT:    vmul.vv v20, v9, v20
-; RV32V-NEXT:    vmul.vv v21, v9, v21
-; RV32V-NEXT:    vmul.vv v22, v9, v22
-; RV32V-NEXT:    vmul.vv v23, v9, v23
-; RV32V-NEXT:    vmul.vv v24, v9, v24
-; RV32V-NEXT:    vmul.vv v25, v9, v25
-; RV32V-NEXT:    vmul.vv v26, v9, v26
-; RV32V-NEXT:    vmul.vv v28, v9, v28
-; RV32V-NEXT:    vmul.vv v30, v9, v30
-; RV32V-NEXT:    vmul.vv v7, v9, v7
-; RV32V-NEXT:    vmul.vv v6, v9, v6
-; RV32V-NEXT:    vmul.vv v5, v9, v5
-; RV32V-NEXT:    vmul.vv v3, v9, v3
-; RV32V-NEXT:    vmul.vv v2, v9, v2
-; RV32V-NEXT:    vmul.vv v1, v9, v1
-; RV32V-NEXT:    vxor.vv v10, v0, v10
-; RV32V-NEXT:    vlse64.v v0, (s5), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v13
-; RV32V-NEXT:    vlse64.v v11, (s4), zero
-; RV32V-NEXT:    csrr s4, vlenb
-; RV32V-NEXT:    slli s4, s4, 2
-; RV32V-NEXT:    add s4, sp, s4
-; RV32V-NEXT:    addi s4, s4, 288
-; RV32V-NEXT:    vs1r.v v11, (s4) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vxor.vv v10, v10, v15
-; RV32V-NEXT:    vlse64.v v11, (s3), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v16
-; RV32V-NEXT:    vlse64.v v13, (s2), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v17
-; RV32V-NEXT:    vlse64.v v17, (s1), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v18
-; RV32V-NEXT:    vlse64.v v18, (s0), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v19
-; RV32V-NEXT:    vlse64.v v19, (t6), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v20
-; RV32V-NEXT:    vlse64.v v20, (t5), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v21
-; RV32V-NEXT:    vlse64.v v21, (t4), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v22
-; RV32V-NEXT:    vlse64.v v22, (t3), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v23
-; RV32V-NEXT:    vlse64.v v23, (t2), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v24
-; RV32V-NEXT:    vlse64.v v24, (t1), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v25
-; RV32V-NEXT:    vlse64.v v25, (t0), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v26
-; RV32V-NEXT:    vlse64.v v26, (a7), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v28
-; RV32V-NEXT:    vlse64.v v28, (a6), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v30
-; RV32V-NEXT:    vlse64.v v30, (a4), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v7
-; RV32V-NEXT:    vlse64.v v7, (a2), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v6
-; RV32V-NEXT:    vlse64.v v6, (a5), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v5
-; RV32V-NEXT:    vlse64.v v5, (a3), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v3
-; RV32V-NEXT:    vlse64.v v3, (s11), zero
-; RV32V-NEXT:    vxor.vv v10, v10, v2
-; RV32V-NEXT:    vlse64.v v2, (ra), zero
-; RV32V-NEXT:    vxor.vv v1, v10, v1
-; RV32V-NEXT:    vlse64.v v10, (s10), zero
-; RV32V-NEXT:    vand.vv v27, v8, v27
-; RV32V-NEXT:    vand.vv v29, v8, v29
-; RV32V-NEXT:    vand.vv v31, v8, v31
-; RV32V-NEXT:    vand.vv v4, v8, v4
-; RV32V-NEXT:    vand.vv v14, v8, v14
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a3, a2, 2
-; RV32V-NEXT:    add a2, a3, a2
-; RV32V-NEXT:    add a2, sp, a2
-; RV32V-NEXT:    addi a2, a2, 288
-; RV32V-NEXT:    vl1r.v v15, (a2) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v15, v8, v15
-; RV32V-NEXT:    vand.vv v16, v8, v12
-; RV32V-NEXT:    vand.vv v0, v8, v0
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a2, a2, 2
-; RV32V-NEXT:    add a2, sp, a2
-; RV32V-NEXT:    addi a2, a2, 288
-; RV32V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    addi a2, sp, 288
-; RV32V-NEXT:    vs1r.v v12, (a2) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v11, v8, v11
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    add a2, sp, a2
-; RV32V-NEXT:    addi a2, a2, 288
-; RV32V-NEXT:    vs1r.v v11, (a2) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v11, v8, v13
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a2, a2, 1
-; RV32V-NEXT:    add a2, sp, a2
-; RV32V-NEXT:    addi a2, a2, 288
-; RV32V-NEXT:    vs1r.v v11, (a2) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v17, v8, v17
-; RV32V-NEXT:    vand.vv v18, v8, v18
-; RV32V-NEXT:    vand.vv v19, v8, v19
-; RV32V-NEXT:    vand.vv v20, v8, v20
-; RV32V-NEXT:    vand.vv v21, v8, v21
-; RV32V-NEXT:    vand.vv v22, v8, v22
-; RV32V-NEXT:    vand.vv v23, v8, v23
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    vand.vv v25, v8, v25
-; RV32V-NEXT:    vand.vv v26, v8, v26
-; RV32V-NEXT:    vand.vv v28, v8, v28
-; RV32V-NEXT:    vand.vv v30, v8, v30
-; RV32V-NEXT:    vand.vv v7, v8, v7
-; RV32V-NEXT:    vand.vv v6, v8, v6
-; RV32V-NEXT:    vand.vv v11, v8, v5
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a3, a2, 1
-; RV32V-NEXT:    add a2, a3, a2
-; RV32V-NEXT:    add a2, sp, a2
-; RV32V-NEXT:    addi a2, a2, 288
-; RV32V-NEXT:    vs1r.v v11, (a2) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v3, v8, v3
-; RV32V-NEXT:    vand.vv v2, v8, v2
-; RV32V-NEXT:    vand.vv v10, v8, v10
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a3, a2, 2
-; RV32V-NEXT:    add a2, a3, a2
-; RV32V-NEXT:    add a2, sp, a2
-; RV32V-NEXT:    addi a2, a2, 288
-; RV32V-NEXT:    vs1r.v v10, (a2) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v10, (a0), zero
-; RV32V-NEXT:    vlse64.v v11, (s8), zero
-; RV32V-NEXT:    vlse64.v v12, (s7), zero
-; RV32V-NEXT:    vlse64.v v13, (s6), zero
-; RV32V-NEXT:    vand.vv v5, v8, v10
-; RV32V-NEXT:    vand.vv v10, v8, v11
-; RV32V-NEXT:    vand.vv v11, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v13
-; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    vand.vx v8, v8, a0
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    vxor.vv v8, v1, v8
-; RV32V-NEXT:    vmul.vv v11, v9, v27
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v29
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v31
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v4
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v15
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vx v11, v8, a1
 ; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    li a2, 256
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    li a2, 512
 ; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    li a2, 1024
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    li a2, 1
+; RV32V-NEXT:    slli a2, a2, 11
 ; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v17
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v18
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v19
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v21
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v22
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v23
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v25
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v26
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v28
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v30
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v7
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v6
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a2, a0, 1
-; RV32V-NEXT:    add a0, a2, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    lui a2, 1
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 2
 ; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v3
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v2
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a2, a0, 2
-; RV32V-NEXT:    add a0, a2, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    lui a2, 4
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 8
 ; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v11, v9, v5
-; RV32V-NEXT:    vxor.vv v8, v8, v11
-; RV32V-NEXT:    vmul.vv v10, v9, v10
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v10, v9, v10
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    vmul.vv v9, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    lui a2, 16
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 32
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    lui a2, 64
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 128
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    lui a2, 256
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 512
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    lui a2, 1024
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 2048
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    lui a2, 4096
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 8192
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    lui a2, 16384
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 32768
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    lui a2, 65536
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 131072
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a2
+; RV32V-NEXT:    addi a2, sp, 8
+; RV32V-NEXT:    sw a0, 8(sp)
+; RV32V-NEXT:    sw zero, 12(sp)
+; RV32V-NEXT:    vmul.vv v12, v9, v12
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vlse64.v v12, (a2), zero
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    vand.vv v11, v8, v12
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    vmul.vv v9, v9, v11
 ; RV32V-NEXT:    vxor.vv v8, v8, v9
 ; RV32V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32V-NEXT:    vnsrl.wx v8, v8, a1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add sp, sp, a0
-; RV32V-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    addi sp, sp, 352
+; RV32V-NEXT:    addi sp, sp, 16
 ; RV32V-NEXT:    ret
 ;
 ; RV64V-LABEL: clmulh_nxv1i32_vx:
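
The vx form above follows the same recipe as the vv form: the splatted scalar and the vector are widened with vzext.vf2, the 64-bit carry-less product is accumulated bit by bit, and the closing vnsrl.wx by 32 keeps bits [63:32], which is exactly what clmulh returns. A scalar sketch under the same promotion scheme (names are mine):

  #include <cstdint>

  // clmulh on 32-bit elements via promotion to 64 bits.
  uint32_t clmulh32(uint32_t a, uint32_t b) {
    uint64_t acc = 0;                                // after vzext.vf2 widening
    for (unsigned i = 0; i < 32; ++i)
      acc ^= uint64_t{a} * (b & (uint64_t{1} << i)); // vand + vmul + vxor per bit
    return uint32_t(acc >> 32);                      // vnsrl.wx ..., 32
  }
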
@@ -4384,810 +3679,141 @@ define <vscale x 1 x i32> @clmulh_nxv1i32_vx(<vscale x 1 x i32> %va, i32 %b) nou
 define <vscale x 2 x i32> @clmulh_nxv2i32_vv(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb) nounwind {
 ; RV32V-LABEL: clmulh_nxv2i32_vv:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -352
-; RV32V-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    sub sp, sp, a0
-; RV32V-NEXT:    lui a2, 524288
-; RV32V-NEXT:    li s0, 1
-; RV32V-NEXT:    li a4, 2
-; RV32V-NEXT:    li a3, 4
-; RV32V-NEXT:    li a1, 8
-; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    li s6, 64
-; RV32V-NEXT:    li s5, 128
-; RV32V-NEXT:    li s1, 256
-; RV32V-NEXT:    li s2, 512
-; RV32V-NEXT:    li s3, 1024
-; RV32V-NEXT:    lui s7, 1
-; RV32V-NEXT:    lui ra, 2
-; RV32V-NEXT:    lui s8, 4
-; RV32V-NEXT:    lui s10, 8
-; RV32V-NEXT:    lui s11, 16
-; RV32V-NEXT:    lui s9, 32
-; RV32V-NEXT:    lui t6, 64
-; RV32V-NEXT:    lui t5, 128
-; RV32V-NEXT:    lui t4, 256
-; RV32V-NEXT:    lui t3, 512
-; RV32V-NEXT:    lui t2, 1024
-; RV32V-NEXT:    lui t1, 2048
-; RV32V-NEXT:    lui t0, 4096
-; RV32V-NEXT:    lui a7, 8192
-; RV32V-NEXT:    lui a6, 16384
-; RV32V-NEXT:    lui a5, 32768
-; RV32V-NEXT:    sw a2, 272(sp)
-; RV32V-NEXT:    sw zero, 276(sp)
-; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw s0, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw a4, 260(sp)
-; RV32V-NEXT:    lui a4, 65536
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw a3, 252(sp)
-; RV32V-NEXT:    lui a3, 131072
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw a1, 244(sp)
-; RV32V-NEXT:    vsetvli s4, zero, e64, m2, ta, ma
+; RV32V-NEXT:    addi sp, sp, -16
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32V-NEXT:    vzext.vf2 v10, v8
 ; RV32V-NEXT:    vzext.vf2 v12, v9
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw a0, 236(sp)
-; RV32V-NEXT:    vand.vx v8, v12, a0
-; RV32V-NEXT:    addi s4, sp, 272
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    sw a0, 228(sp)
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s6, 220(sp)
-; RV32V-NEXT:    vand.vx v28, v12, s6
-; RV32V-NEXT:    addi s6, sp, 264
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s5, 212(sp)
-; RV32V-NEXT:    vand.vx v30, v12, s5
-; RV32V-NEXT:    addi s5, sp, 256
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s1, 204(sp)
-; RV32V-NEXT:    vand.vx v6, v12, s1
-; RV32V-NEXT:    addi s1, sp, 248
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s2, 196(sp)
-; RV32V-NEXT:    vand.vx v4, v12, s2
-; RV32V-NEXT:    addi s2, sp, 240
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s3, 188(sp)
-; RV32V-NEXT:    vand.vx v2, v12, s3
-; RV32V-NEXT:    addi s3, sp, 232
-; RV32V-NEXT:    slli s0, s0, 11
-; RV32V-NEXT:    vand.vx v24, v12, s7
-; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw s0, 180(sp)
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw s7, 172(sp)
-; RV32V-NEXT:    addi s7, sp, 224
-; RV32V-NEXT:    vand.vx v26, v12, ra
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw ra, 164(sp)
-; RV32V-NEXT:    addi a1, sp, 216
-; RV32V-NEXT:    vand.vx v22, v12, s8
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw s8, 156(sp)
-; RV32V-NEXT:    addi s8, sp, 208
-; RV32V-NEXT:    vand.vx v20, v12, s10
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw s10, 148(sp)
-; RV32V-NEXT:    addi s10, sp, 200
-; RV32V-NEXT:    vand.vx v16, v12, s11
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw s11, 140(sp)
-; RV32V-NEXT:    addi s11, sp, 192
-; RV32V-NEXT:    vand.vx v14, v12, s9
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw s9, 132(sp)
-; RV32V-NEXT:    addi s9, sp, 184
-; RV32V-NEXT:    vand.vi v18, v12, 2
-; RV32V-NEXT:    vand.vi v0, v12, 1
-; RV32V-NEXT:    vmul.vv v18, v10, v18
-; RV32V-NEXT:    vmul.vv v0, v10, v0
-; RV32V-NEXT:    vxor.vi v0, v0, 0
-; RV32V-NEXT:    vxor.vv v18, v0, v18
-; RV32V-NEXT:    vand.vi v0, v12, 4
-; RV32V-NEXT:    vmul.vv v0, v10, v0
-; RV32V-NEXT:    vxor.vv v18, v18, v0
-; RV32V-NEXT:    vand.vi v0, v12, 8
-; RV32V-NEXT:    vmul.vv v0, v10, v0
-; RV32V-NEXT:    vxor.vv v0, v18, v0
-; RV32V-NEXT:    vand.vx v18, v12, t6
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t6, 124(sp)
-; RV32V-NEXT:    addi t6, sp, 176
+; RV32V-NEXT:    li a2, 16
+; RV32V-NEXT:    li a1, 32
+; RV32V-NEXT:    vand.vx v8, v12, a2
+; RV32V-NEXT:    li a2, 64
+; RV32V-NEXT:    vand.vi v14, v12, 2
+; RV32V-NEXT:    vand.vi v16, v12, 1
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v14, v16, v14
+; RV32V-NEXT:    vand.vi v16, v12, 4
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    vand.vi v16, v12, 8
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
+; RV32V-NEXT:    li a2, 128
 ; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    vxor.vv v8, v0, v8
-; RV32V-NEXT:    vand.vx v0, v12, a0
-; RV32V-NEXT:    vmul.vv v0, v10, v0
-; RV32V-NEXT:    vxor.vv v0, v8, v0
-; RV32V-NEXT:    vand.vx v8, v12, t5
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t5, 116(sp)
-; RV32V-NEXT:    addi t5, sp, 168
-; RV32V-NEXT:    vmul.vv v28, v10, v28
-; RV32V-NEXT:    vxor.vv v0, v0, v28
-; RV32V-NEXT:    vand.vx v28, v12, t4
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t4, 108(sp)
-; RV32V-NEXT:    addi t4, sp, 160
-; RV32V-NEXT:    vmul.vv v30, v10, v30
-; RV32V-NEXT:    vxor.vv v0, v0, v30
-; RV32V-NEXT:    vand.vx v30, v12, t3
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw t3, 100(sp)
-; RV32V-NEXT:    addi t3, sp, 152
-; RV32V-NEXT:    vmul.vv v6, v10, v6
-; RV32V-NEXT:    vxor.vv v0, v0, v6
-; RV32V-NEXT:    vand.vx v6, v12, t2
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw t2, 92(sp)
-; RV32V-NEXT:    addi t2, sp, 144
-; RV32V-NEXT:    vmul.vv v4, v10, v4
-; RV32V-NEXT:    vxor.vv v0, v0, v4
-; RV32V-NEXT:    vand.vx v4, v12, t1
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw t1, 84(sp)
-; RV32V-NEXT:    addi t1, sp, 136
-; RV32V-NEXT:    vmul.vv v2, v10, v2
-; RV32V-NEXT:    vxor.vv v2, v0, v2
-; RV32V-NEXT:    vand.vx v0, v12, s0
-; RV32V-NEXT:    addi ra, sp, 128
-; RV32V-NEXT:    vmul.vv v0, v10, v0
-; RV32V-NEXT:    vxor.vv v0, v2, v0
-; RV32V-NEXT:    vand.vx v2, v12, t0
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw t0, 76(sp)
-; RV32V-NEXT:    addi t0, sp, 120
-; RV32V-NEXT:    vmul.vv v24, v10, v24
-; RV32V-NEXT:    vxor.vv v0, v0, v24
-; RV32V-NEXT:    vand.vx v24, v12, a7
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a7, 68(sp)
-; RV32V-NEXT:    addi a7, sp, 112
-; RV32V-NEXT:    vmul.vv v26, v10, v26
-; RV32V-NEXT:    vxor.vv v0, v0, v26
-; RV32V-NEXT:    vand.vx v26, v12, a6
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a6, 60(sp)
-; RV32V-NEXT:    addi a6, sp, 104
-; RV32V-NEXT:    vmul.vv v22, v10, v22
-; RV32V-NEXT:    vxor.vv v0, v0, v22
-; RV32V-NEXT:    vand.vx v22, v12, a5
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a5, 52(sp)
-; RV32V-NEXT:    addi a5, sp, 96
-; RV32V-NEXT:    vmul.vv v20, v10, v20
-; RV32V-NEXT:    vxor.vv v0, v0, v20
-; RV32V-NEXT:    vand.vx v20, v12, a4
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw a4, 44(sp)
-; RV32V-NEXT:    addi a4, sp, 88
+; RV32V-NEXT:    vxor.vv v8, v14, v8
+; RV32V-NEXT:    vand.vx v14, v12, a1
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
+; RV32V-NEXT:    li a2, 256
 ; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v0, v0, v16
-; RV32V-NEXT:    vand.vx v16, v12, a3
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw a3, 36(sp)
-; RV32V-NEXT:    addi a3, sp, 80
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    sw a0, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw a2, 20(sp)
-; RV32V-NEXT:    addi a2, sp, 72
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
+; RV32V-NEXT:    li a2, 512
 ; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v0, v0, v14
-; RV32V-NEXT:    vlse64.v v14, (s4), zero
-; RV32V-NEXT:    csrr s0, vlenb
-; RV32V-NEXT:    slli s0, s0, 1
-; RV32V-NEXT:    mv s4, s0
-; RV32V-NEXT:    slli s0, s0, 2
-; RV32V-NEXT:    add s4, s4, s0
-; RV32V-NEXT:    slli s0, s0, 2
-; RV32V-NEXT:    add s0, s0, s4
-; RV32V-NEXT:    add s0, sp, s0
-; RV32V-NEXT:    addi s0, s0, 288
-; RV32V-NEXT:    vs2r.v v14, (s0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi s0, sp, 64
-; RV32V-NEXT:    vmul.vv v18, v10, v18
-; RV32V-NEXT:    vxor.vv v0, v0, v18
-; RV32V-NEXT:    vlse64.v v14, (s6), zero
-; RV32V-NEXT:    csrr s4, vlenb
-; RV32V-NEXT:    slli s4, s4, 3
-; RV32V-NEXT:    mv s6, s4
-; RV32V-NEXT:    slli s4, s4, 2
-; RV32V-NEXT:    add s4, s4, s6
-; RV32V-NEXT:    add s4, sp, s4
-; RV32V-NEXT:    addi s4, s4, 288
-; RV32V-NEXT:    vs2r.v v14, (s4) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi s4, sp, 56
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    vxor.vv v8, v0, v8
-; RV32V-NEXT:    vlse64.v v14, (s5), zero
-; RV32V-NEXT:    csrr s5, vlenb
-; RV32V-NEXT:    slli s5, s5, 1
-; RV32V-NEXT:    mv s6, s5
-; RV32V-NEXT:    slli s5, s5, 1
-; RV32V-NEXT:    add s6, s6, s5
-; RV32V-NEXT:    slli s5, s5, 3
-; RV32V-NEXT:    add s5, s5, s6
-; RV32V-NEXT:    add s5, sp, s5
-; RV32V-NEXT:    addi s5, s5, 288
-; RV32V-NEXT:    vs2r.v v14, (s5) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi s5, sp, 48
-; RV32V-NEXT:    vmul.vv v28, v10, v28
-; RV32V-NEXT:    vxor.vv v8, v8, v28
-; RV32V-NEXT:    vlse64.v v0, (s1), zero
-; RV32V-NEXT:    addi s1, sp, 40
-; RV32V-NEXT:    vmul.vv v30, v10, v30
-; RV32V-NEXT:    vxor.vv v8, v8, v30
-; RV32V-NEXT:    vlse64.v v30, (s2), zero
-; RV32V-NEXT:    addi s2, sp, 32
-; RV32V-NEXT:    vmul.vv v6, v10, v6
-; RV32V-NEXT:    vxor.vv v8, v8, v6
-; RV32V-NEXT:    vlse64.v v6, (s3), zero
-; RV32V-NEXT:    addi s3, sp, 24
-; RV32V-NEXT:    vmul.vv v4, v10, v4
-; RV32V-NEXT:    vxor.vv v8, v8, v4
-; RV32V-NEXT:    vlse64.v v4, (s7), zero
-; RV32V-NEXT:    addi s6, sp, 16
-; RV32V-NEXT:    vmul.vv v2, v10, v2
-; RV32V-NEXT:    vmul.vv v24, v10, v24
-; RV32V-NEXT:    vmul.vv v26, v10, v26
-; RV32V-NEXT:    vmul.vv v22, v10, v22
-; RV32V-NEXT:    vmul.vv v20, v10, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
+; RV32V-NEXT:    li a2, 1024
 ; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v2
-; RV32V-NEXT:    vlse64.v v2, (a1), zero
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vlse64.v v28, (s8), zero
-; RV32V-NEXT:    vxor.vv v8, v8, v26
-; RV32V-NEXT:    vlse64.v v24, (s10), zero
-; RV32V-NEXT:    vxor.vv v8, v8, v22
-; RV32V-NEXT:    vlse64.v v18, (s11), zero
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vlse64.v v14, (s9), zero
-; RV32V-NEXT:    vxor.vv v22, v8, v16
-; RV32V-NEXT:    vlse64.v v8, (t6), zero
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    mv t6, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add t6, t6, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, a1, t6
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vand.vv v16, v12, v16
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    mv t6, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, a1, t6
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vl2r.v v20, (a1) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vand.vv v20, v12, v20
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    mv t6, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add t6, t6, a1
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    add a1, a1, t6
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vl2r.v v26, (a1) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vand.vv v26, v12, v26
-; RV32V-NEXT:    vand.vv v0, v12, v0
-; RV32V-NEXT:    vand.vv v30, v12, v30
-; RV32V-NEXT:    vand.vv v6, v12, v6
-; RV32V-NEXT:    vand.vv v4, v12, v4
-; RV32V-NEXT:    vand.vv v2, v12, v2
-; RV32V-NEXT:    vand.vv v28, v12, v28
-; RV32V-NEXT:    vand.vv v24, v12, v24
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v24, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v18, v12, v18
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    mv t6, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, a1, t6
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v18, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v12, v14
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 5
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v14, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v12, v8
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    mv t6, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add t6, t6, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, a1, t6
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v8, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (t5), zero
-; RV32V-NEXT:    vlse64.v v18, (t4), zero
-; RV32V-NEXT:    vlse64.v v24, (t3), zero
-; RV32V-NEXT:    vlse64.v v14, (t2), zero
-; RV32V-NEXT:    vand.vv v8, v12, v8
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    mv t2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, t2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v8, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v12, v18
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    mv t2, a1
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    add a1, a1, t2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v8, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v12, v24
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    mv t2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add t2, t2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add t2, t2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, t2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v8, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v12, v14
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    mv t2, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, a1, t2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v8, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v14, (t1), zero
-; RV32V-NEXT:    vlse64.v v18, (ra), zero
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
-; RV32V-NEXT:    vlse64.v v8, (a7), zero
-; RV32V-NEXT:    vand.vv v14, v12, v14
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v14, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v12, v18
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 4
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v14, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v12, v24
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    mv a7, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a7, a7, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a7
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v14, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v12, v8
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    mv a7, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a7, a7, a1
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    add a1, a1, a7
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v8, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (a6), zero
-; RV32V-NEXT:    vlse64.v v14, (a5), zero
-; RV32V-NEXT:    vlse64.v v18, (a4), zero
-; RV32V-NEXT:    vlse64.v v24, (a3), zero
-; RV32V-NEXT:    vand.vv v8, v12, v8
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v8, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v12, v14
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    mv a3, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a3, a3, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a3
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v8, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v12, v18
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    mv a3, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a3, a3, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a3
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v8, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v12, v24
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    mv a3, a1
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    add a1, a1, a3
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v8, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (a2), zero
-; RV32V-NEXT:    vlse64.v v14, (s0), zero
-; RV32V-NEXT:    vlse64.v v24, (s4), zero
-; RV32V-NEXT:    vlse64.v v18, (s5), zero
-; RV32V-NEXT:    vand.vv v8, v12, v8
-; RV32V-NEXT:    addi a1, sp, 288
-; RV32V-NEXT:    vs2r.v v8, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v12, v14
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v8, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v12, v24
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v8, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v12, v18
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 4
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v8, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v14, (s1), zero
-; RV32V-NEXT:    vlse64.v v18, (s2), zero
-; RV32V-NEXT:    vlse64.v v8, (s3), zero
-; RV32V-NEXT:    vlse64.v v24, (s6), zero
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
+; RV32V-NEXT:    lui a2, 1
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
+; RV32V-NEXT:    li a2, 1
+; RV32V-NEXT:    slli a2, a2, 11
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
+; RV32V-NEXT:    lui a2, 2
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
+; RV32V-NEXT:    lui a2, 4
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
+; RV32V-NEXT:    lui a2, 8
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
+; RV32V-NEXT:    lui a2, 16
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
+; RV32V-NEXT:    lui a2, 32
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
+; RV32V-NEXT:    lui a2, 64
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
+; RV32V-NEXT:    lui a2, 128
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
+; RV32V-NEXT:    lui a2, 256
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
+; RV32V-NEXT:    lui a2, 512
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
+; RV32V-NEXT:    lui a2, 1024
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
+; RV32V-NEXT:    lui a2, 2048
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
+; RV32V-NEXT:    lui a2, 4096
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
+; RV32V-NEXT:    lui a2, 8192
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
+; RV32V-NEXT:    lui a2, 16384
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
+; RV32V-NEXT:    lui a2, 32768
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
+; RV32V-NEXT:    lui a2, 65536
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a2
+; RV32V-NEXT:    lui a2, 131072
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vand.vx v16, v12, a2
+; RV32V-NEXT:    addi a2, sp, 8
+; RV32V-NEXT:    sw a0, 8(sp)
+; RV32V-NEXT:    sw zero, 12(sp)
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vlse64.v v14, (a2), zero
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    lui a0, 262144
 ; RV32V-NEXT:    vand.vv v14, v12, v14
-; RV32V-NEXT:    vand.vv v18, v12, v18
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v18, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v12, v8
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a2, a2, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs2r.v v8, (a1) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v18, v12, v24
-; RV32V-NEXT:    vand.vx v8, v12, a0
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    vxor.vv v8, v22, v8
-; RV32V-NEXT:    vmul.vv v12, v10, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v12, v10, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v12, v10, v26
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v12, v10, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v12, v10, v30
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v12, v10, v6
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v12, v10, v4
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v12, v10, v2
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v12, v10, v28
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v12, v10, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vand.vx v12, v12, a0
 ; RV32V-NEXT:    vmul.vv v12, v10, v12
 ; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v10, v10, v18
+; RV32V-NEXT:    vmul.vv v10, v10, v14
 ; RV32V-NEXT:    vxor.vv v10, v8, v10
 ; RV32V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vnsrl.wx v8, v10, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add sp, sp, a0
-; RV32V-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    addi sp, sp, 352
+; RV32V-NEXT:    vnsrl.wx v8, v10, a1
+; RV32V-NEXT:    addi sp, sp, 16
 ; RV32V-NEXT:    ret
 ;
 ; RV64V-LABEL: clmulh_nxv2i32_vv:
@@ -5355,793 +3981,143 @@ define <vscale x 2 x i32> @clmulh_nxv2i32_vv(<vscale x 2 x i32> %va, <vscale x 2
 define <vscale x 2 x i32> @clmulh_nxv2i32_vx(<vscale x 2 x i32> %va, i32 %b) nounwind {
 ; RV32V-LABEL: clmulh_nxv2i32_vx:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -352
-; RV32V-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a2, a2, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    sub sp, sp, a1
+; RV32V-NEXT:    addi sp, sp, -16
 ; RV32V-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32V-NEXT:    vmv.v.x v12, a0
-; RV32V-NEXT:    lui s11, 524288
-; RV32V-NEXT:    li a0, 1
-; RV32V-NEXT:    li a4, 2
-; RV32V-NEXT:    li a2, 4
-; RV32V-NEXT:    li a1, 8
-; RV32V-NEXT:    li s9, 16
-; RV32V-NEXT:    li ra, 32
-; RV32V-NEXT:    li s10, 64
-; RV32V-NEXT:    li s8, 128
-; RV32V-NEXT:    li s7, 256
-; RV32V-NEXT:    li s6, 512
-; RV32V-NEXT:    li s5, 1024
-; RV32V-NEXT:    lui s4, 1
-; RV32V-NEXT:    lui s3, 2
-; RV32V-NEXT:    lui s2, 4
-; RV32V-NEXT:    lui s1, 8
-; RV32V-NEXT:    lui s0, 16
-; RV32V-NEXT:    lui t6, 32
-; RV32V-NEXT:    lui t5, 64
-; RV32V-NEXT:    lui t4, 128
-; RV32V-NEXT:    lui t3, 256
-; RV32V-NEXT:    lui t2, 512
-; RV32V-NEXT:    lui t1, 1024
-; RV32V-NEXT:    lui t0, 2048
-; RV32V-NEXT:    lui a7, 4096
-; RV32V-NEXT:    lui a6, 8192
-; RV32V-NEXT:    lui a5, 16384
-; RV32V-NEXT:    lui a3, 32768
-; RV32V-NEXT:    sw s11, 272(sp)
-; RV32V-NEXT:    sw zero, 276(sp)
-; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw a0, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw a4, 260(sp)
-; RV32V-NEXT:    lui a4, 65536
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw a2, 252(sp)
-; RV32V-NEXT:    lui a2, 131072
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw a1, 244(sp)
-; RV32V-NEXT:    lui a1, 262144
+; RV32V-NEXT:    lui a0, 524288
 ; RV32V-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; RV32V-NEXT:    vzext.vf2 v10, v8
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw s9, 236(sp)
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw ra, 228(sp)
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s10, 220(sp)
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s8, 212(sp)
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s7, 204(sp)
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s6, 196(sp)
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s5, 188(sp)
-; RV32V-NEXT:    slli a0, a0, 11
-; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw a0, 180(sp)
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw s4, 172(sp)
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s3, 164(sp)
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw s2, 156(sp)
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw s1, 148(sp)
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw s0, 140(sp)
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw t6, 132(sp)
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t5, 124(sp)
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t4, 116(sp)
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t3, 108(sp)
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw t2, 100(sp)
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw t1, 92(sp)
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw t0, 84(sp)
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a7, 76(sp)
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a6, 68(sp)
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a5, 60(sp)
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a3, 52(sp)
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw a4, 44(sp)
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw a2, 36(sp)
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    sw a1, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw s11, 20(sp)
-; RV32V-NEXT:    addi ra, sp, 272
+; RV32V-NEXT:    li a2, 16
+; RV32V-NEXT:    li a1, 32
 ; RV32V-NEXT:    vzext.vf2 v8, v12
-; RV32V-NEXT:    vand.vx v28, v8, s9
-; RV32V-NEXT:    addi s11, sp, 264
-; RV32V-NEXT:    vand.vx v30, v8, s10
-; RV32V-NEXT:    addi s10, sp, 256
-; RV32V-NEXT:    vand.vx v6, v8, s8
-; RV32V-NEXT:    addi s9, sp, 248
-; RV32V-NEXT:    vand.vx v4, v8, s7
-; RV32V-NEXT:    addi s8, sp, 240
-; RV32V-NEXT:    vand.vx v2, v8, s6
-; RV32V-NEXT:    addi s7, sp, 232
-; RV32V-NEXT:    vand.vx v26, v8, s5
-; RV32V-NEXT:    addi s6, sp, 224
-; RV32V-NEXT:    vand.vx v22, v8, a0
-; RV32V-NEXT:    addi s5, sp, 216
-; RV32V-NEXT:    vand.vx v20, v8, s4
-; RV32V-NEXT:    addi s4, sp, 208
-; RV32V-NEXT:    vand.vx v18, v8, s3
-; RV32V-NEXT:    addi s3, sp, 200
-; RV32V-NEXT:    vand.vx v16, v8, s2
-; RV32V-NEXT:    addi s2, sp, 192
-; RV32V-NEXT:    vand.vx v14, v8, s1
-; RV32V-NEXT:    addi s1, sp, 184
-; RV32V-NEXT:    vand.vx v0, v8, s0
-; RV32V-NEXT:    addi s0, sp, 176
-; RV32V-NEXT:    vand.vi v12, v8, 2
-; RV32V-NEXT:    vand.vi v24, v8, 1
+; RV32V-NEXT:    vand.vx v12, v8, a2
+; RV32V-NEXT:    li a2, 64
+; RV32V-NEXT:    vand.vi v14, v8, 2
+; RV32V-NEXT:    vand.vi v16, v8, 1
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v14, v16, v14
+; RV32V-NEXT:    vand.vi v16, v8, 4
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    vand.vi v16, v8, 8
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    li a2, 128
 ; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vmul.vv v24, v10, v24
-; RV32V-NEXT:    vxor.vi v24, v24, 0
-; RV32V-NEXT:    vxor.vv v12, v24, v12
-; RV32V-NEXT:    vand.vi v24, v8, 4
-; RV32V-NEXT:    vmul.vv v24, v10, v24
-; RV32V-NEXT:    vxor.vv v12, v12, v24
-; RV32V-NEXT:    vand.vi v24, v8, 8
-; RV32V-NEXT:    vmul.vv v24, v10, v24
-; RV32V-NEXT:    vxor.vv v12, v12, v24
-; RV32V-NEXT:    vand.vx v24, v8, t6
-; RV32V-NEXT:    addi t6, sp, 168
-; RV32V-NEXT:    vmul.vv v28, v10, v28
-; RV32V-NEXT:    vxor.vv v12, v12, v28
-; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vand.vx v28, v8, a0
-; RV32V-NEXT:    vmul.vv v28, v10, v28
-; RV32V-NEXT:    vxor.vv v12, v12, v28
-; RV32V-NEXT:    vand.vx v28, v8, t5
-; RV32V-NEXT:    addi t5, sp, 160
-; RV32V-NEXT:    vmul.vv v30, v10, v30
-; RV32V-NEXT:    vxor.vv v12, v12, v30
-; RV32V-NEXT:    vand.vx v30, v8, t4
-; RV32V-NEXT:    addi t4, sp, 152
-; RV32V-NEXT:    vmul.vv v6, v10, v6
-; RV32V-NEXT:    vxor.vv v12, v12, v6
-; RV32V-NEXT:    vand.vx v6, v8, t3
-; RV32V-NEXT:    addi t3, sp, 144
-; RV32V-NEXT:    vmul.vv v4, v10, v4
-; RV32V-NEXT:    vxor.vv v12, v12, v4
-; RV32V-NEXT:    vand.vx v4, v8, t2
-; RV32V-NEXT:    addi a0, sp, 136
-; RV32V-NEXT:    vmul.vv v2, v10, v2
-; RV32V-NEXT:    vxor.vv v12, v12, v2
-; RV32V-NEXT:    vand.vx v2, v8, t1
-; RV32V-NEXT:    addi t1, sp, 128
-; RV32V-NEXT:    vmul.vv v26, v10, v26
-; RV32V-NEXT:    vxor.vv v12, v12, v26
-; RV32V-NEXT:    vand.vx v26, v8, t0
-; RV32V-NEXT:    addi t0, sp, 120
-; RV32V-NEXT:    vmul.vv v22, v10, v22
-; RV32V-NEXT:    vxor.vv v12, v12, v22
-; RV32V-NEXT:    vand.vx v22, v8, a7
-; RV32V-NEXT:    addi a7, sp, 112
-; RV32V-NEXT:    vmul.vv v20, v10, v20
-; RV32V-NEXT:    vxor.vv v12, v12, v20
-; RV32V-NEXT:    vand.vx v20, v8, a6
-; RV32V-NEXT:    addi a6, sp, 104
-; RV32V-NEXT:    vmul.vv v18, v10, v18
-; RV32V-NEXT:    vxor.vv v12, v12, v18
-; RV32V-NEXT:    vand.vx v18, v8, a5
-; RV32V-NEXT:    addi a5, sp, 96
+; RV32V-NEXT:    vxor.vv v12, v14, v12
+; RV32V-NEXT:    vand.vx v14, v8, a1
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    li a2, 256
 ; RV32V-NEXT:    vmul.vv v16, v10, v16
 ; RV32V-NEXT:    vxor.vv v12, v12, v16
-; RV32V-NEXT:    vand.vx v16, v8, a3
-; RV32V-NEXT:    addi a3, sp, 88
+; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    li a2, 512
 ; RV32V-NEXT:    vmul.vv v14, v10, v14
-; RV32V-NEXT:    vxor.vv v14, v12, v14
-; RV32V-NEXT:    vand.vx v12, v8, a4
-; RV32V-NEXT:    addi a4, sp, 80
-; RV32V-NEXT:    vmul.vv v0, v10, v0
-; RV32V-NEXT:    vxor.vv v14, v14, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
-; RV32V-NEXT:    addi a2, sp, 72
-; RV32V-NEXT:    vmul.vv v24, v10, v24
-; RV32V-NEXT:    vxor.vv v24, v14, v24
-; RV32V-NEXT:    vlse64.v v14, (ra), zero
-; RV32V-NEXT:    csrr t2, vlenb
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    mv ra, t2
-; RV32V-NEXT:    slli t2, t2, 2
-; RV32V-NEXT:    add ra, ra, t2
-; RV32V-NEXT:    slli t2, t2, 2
-; RV32V-NEXT:    add t2, t2, ra
-; RV32V-NEXT:    add t2, sp, t2
-; RV32V-NEXT:    addi t2, t2, 288
-; RV32V-NEXT:    vs2r.v v14, (t2) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi t2, sp, 64
-; RV32V-NEXT:    vmul.vv v28, v10, v28
-; RV32V-NEXT:    vxor.vv v28, v24, v28
-; RV32V-NEXT:    vlse64.v v14, (s11), zero
-; RV32V-NEXT:    csrr s11, vlenb
-; RV32V-NEXT:    slli s11, s11, 3
-; RV32V-NEXT:    mv ra, s11
-; RV32V-NEXT:    slli s11, s11, 2
-; RV32V-NEXT:    add s11, s11, ra
-; RV32V-NEXT:    add s11, sp, s11
-; RV32V-NEXT:    addi s11, s11, 288
-; RV32V-NEXT:    vs2r.v v14, (s11) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi s11, sp, 56
-; RV32V-NEXT:    vmul.vv v30, v10, v30
-; RV32V-NEXT:    vxor.vv v30, v28, v30
-; RV32V-NEXT:    vlse64.v v28, (s10), zero
-; RV32V-NEXT:    addi s10, sp, 48
-; RV32V-NEXT:    vmul.vv v6, v10, v6
-; RV32V-NEXT:    vxor.vv v6, v30, v6
-; RV32V-NEXT:    vlse64.v v30, (s9), zero
-; RV32V-NEXT:    addi s9, sp, 40
-; RV32V-NEXT:    vmul.vv v4, v10, v4
-; RV32V-NEXT:    vxor.vv v4, v6, v4
-; RV32V-NEXT:    vlse64.v v6, (s8), zero
-; RV32V-NEXT:    addi s8, sp, 32
-; RV32V-NEXT:    vmul.vv v2, v10, v2
-; RV32V-NEXT:    vxor.vv v2, v4, v2
-; RV32V-NEXT:    vlse64.v v4, (s7), zero
-; RV32V-NEXT:    addi s7, sp, 24
-; RV32V-NEXT:    vmul.vv v26, v10, v26
-; RV32V-NEXT:    vxor.vv v26, v2, v26
-; RV32V-NEXT:    vlse64.v v2, (s6), zero
-; RV32V-NEXT:    addi s6, sp, 16
-; RV32V-NEXT:    vmul.vv v22, v10, v22
-; RV32V-NEXT:    vmul.vv v20, v10, v20
-; RV32V-NEXT:    vmul.vv v18, v10, v18
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    li a2, 1024
 ; RV32V-NEXT:    vmul.vv v16, v10, v16
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vmul.vv v0, v10, v0
-; RV32V-NEXT:    vxor.vv v22, v26, v22
-; RV32V-NEXT:    vlse64.v v24, (s5), zero
-; RV32V-NEXT:    vxor.vv v20, v22, v20
-; RV32V-NEXT:    vlse64.v v22, (s4), zero
-; RV32V-NEXT:    vxor.vv v18, v20, v18
-; RV32V-NEXT:    vlse64.v v20, (s3), zero
-; RV32V-NEXT:    vxor.vv v16, v18, v16
-; RV32V-NEXT:    vlse64.v v18, (s2), zero
-; RV32V-NEXT:    vxor.vv v12, v16, v12
-; RV32V-NEXT:    vlse64.v v14, (s1), zero
-; RV32V-NEXT:    vxor.vv v26, v12, v0
-; RV32V-NEXT:    vlse64.v v12, (s0), zero
-; RV32V-NEXT:    csrr s0, vlenb
-; RV32V-NEXT:    slli s0, s0, 1
-; RV32V-NEXT:    mv s1, s0
-; RV32V-NEXT:    slli s0, s0, 2
-; RV32V-NEXT:    add s1, s1, s0
-; RV32V-NEXT:    slli s0, s0, 2
-; RV32V-NEXT:    add s0, s0, s1
-; RV32V-NEXT:    add s0, sp, s0
-; RV32V-NEXT:    addi s0, s0, 288
-; RV32V-NEXT:    vl2r.v v16, (s0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vand.vv v16, v8, v16
-; RV32V-NEXT:    csrr s0, vlenb
-; RV32V-NEXT:    slli s0, s0, 3
-; RV32V-NEXT:    mv s1, s0
-; RV32V-NEXT:    slli s0, s0, 2
-; RV32V-NEXT:    add s0, s0, s1
-; RV32V-NEXT:    add s0, sp, s0
-; RV32V-NEXT:    addi s0, s0, 288
-; RV32V-NEXT:    vl2r.v v0, (s0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vand.vv v0, v8, v0
-; RV32V-NEXT:    vand.vv v28, v8, v28
-; RV32V-NEXT:    vand.vv v30, v8, v30
-; RV32V-NEXT:    vand.vv v6, v8, v6
-; RV32V-NEXT:    vand.vv v4, v8, v4
-; RV32V-NEXT:    vand.vv v2, v8, v2
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    vand.vv v22, v8, v22
-; RV32V-NEXT:    vand.vv v20, v8, v20
-; RV32V-NEXT:    csrr s0, vlenb
-; RV32V-NEXT:    slli s0, s0, 3
-; RV32V-NEXT:    add s0, sp, s0
-; RV32V-NEXT:    addi s0, s0, 288
-; RV32V-NEXT:    vs2r.v v20, (s0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v18, v8, v18
-; RV32V-NEXT:    csrr s0, vlenb
-; RV32V-NEXT:    slli s0, s0, 2
-; RV32V-NEXT:    mv s1, s0
-; RV32V-NEXT:    slli s0, s0, 2
-; RV32V-NEXT:    add s0, s0, s1
-; RV32V-NEXT:    add s0, sp, s0
-; RV32V-NEXT:    addi s0, s0, 288
-; RV32V-NEXT:    vs2r.v v18, (s0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v14
-; RV32V-NEXT:    csrr s0, vlenb
-; RV32V-NEXT:    slli s0, s0, 5
-; RV32V-NEXT:    add s0, sp, s0
-; RV32V-NEXT:    addi s0, s0, 288
-; RV32V-NEXT:    vs2r.v v14, (s0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr s0, vlenb
-; RV32V-NEXT:    slli s0, s0, 1
-; RV32V-NEXT:    mv s1, s0
-; RV32V-NEXT:    slli s0, s0, 2
-; RV32V-NEXT:    add s1, s1, s0
-; RV32V-NEXT:    slli s0, s0, 2
-; RV32V-NEXT:    add s0, s0, s1
-; RV32V-NEXT:    add s0, sp, s0
-; RV32V-NEXT:    addi s0, s0, 288
-; RV32V-NEXT:    vs2r.v v12, (s0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v12, (t6), zero
-; RV32V-NEXT:    vlse64.v v18, (t5), zero
-; RV32V-NEXT:    vlse64.v v20, (t4), zero
-; RV32V-NEXT:    vlse64.v v14, (t3), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr t3, vlenb
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    mv t4, t3
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    add t3, t3, t4
-; RV32V-NEXT:    add t3, sp, t3
-; RV32V-NEXT:    addi t3, t3, 288
-; RV32V-NEXT:    vs2r.v v12, (t3) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v18
-; RV32V-NEXT:    csrr t3, vlenb
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    mv t4, t3
-; RV32V-NEXT:    slli t3, t3, 3
-; RV32V-NEXT:    add t3, t3, t4
-; RV32V-NEXT:    add t3, sp, t3
-; RV32V-NEXT:    addi t3, t3, 288
-; RV32V-NEXT:    vs2r.v v12, (t3) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr t3, vlenb
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    mv t4, t3
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    add t4, t4, t3
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    add t4, t4, t3
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    add t3, t3, t4
-; RV32V-NEXT:    add t3, sp, t3
-; RV32V-NEXT:    addi t3, t3, 288
-; RV32V-NEXT:    vs2r.v v12, (t3) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v14
-; RV32V-NEXT:    csrr t3, vlenb
-; RV32V-NEXT:    slli t3, t3, 3
-; RV32V-NEXT:    mv t4, t3
-; RV32V-NEXT:    slli t3, t3, 2
-; RV32V-NEXT:    add t3, t3, t4
-; RV32V-NEXT:    add t3, sp, t3
-; RV32V-NEXT:    addi t3, t3, 288
-; RV32V-NEXT:    vs2r.v v12, (t3) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v14, (a0), zero
-; RV32V-NEXT:    vlse64.v v18, (t1), zero
-; RV32V-NEXT:    vlse64.v v20, (t0), zero
-; RV32V-NEXT:    vlse64.v v12, (a7), zero
-; RV32V-NEXT:    vand.vv v14, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v14, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v18
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v14, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a7, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a7, a7, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v14, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a7, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a7, a7, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v12, (a6), zero
-; RV32V-NEXT:    vlse64.v v14, (a5), zero
-; RV32V-NEXT:    vlse64.v v18, (a3), zero
-; RV32V-NEXT:    vlse64.v v20, (a4), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a3, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a3, a3, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v18
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a3, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a3, a3, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a3, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v12, (a2), zero
-; RV32V-NEXT:    vlse64.v v14, (t2), zero
-; RV32V-NEXT:    vlse64.v v20, (s11), zero
-; RV32V-NEXT:    vlse64.v v18, (s10), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a2, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a2, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v18
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a2, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v14, (s9), zero
-; RV32V-NEXT:    vlse64.v v18, (s8), zero
-; RV32V-NEXT:    vlse64.v v12, (s7), zero
-; RV32V-NEXT:    vlse64.v v20, (s6), zero
-; RV32V-NEXT:    vand.vv v14, v8, v14
-; RV32V-NEXT:    vand.vv v18, v8, v18
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a2, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v18, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a2, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a2, a2, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v18, v8, v20
-; RV32V-NEXT:    vand.vx v8, v8, a1
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    li a2, 1
+; RV32V-NEXT:    slli a2, a2, 11
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    lui a2, 1
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    lui a2, 2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    lui a2, 4
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    lui a2, 8
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    lui a2, 16
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    lui a2, 32
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    lui a2, 64
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    lui a2, 128
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    lui a2, 256
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    lui a2, 512
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    lui a2, 1024
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    lui a2, 2048
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    lui a2, 4096
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    lui a2, 8192
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    lui a2, 16384
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    lui a2, 32768
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    lui a2, 65536
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    lui a2, 131072
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a2
+; RV32V-NEXT:    addi a2, sp, 8
+; RV32V-NEXT:    sw a0, 8(sp)
+; RV32V-NEXT:    sw zero, 12(sp)
+; RV32V-NEXT:    vmul.vv v16, v10, v16
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vlse64.v v16, (a2), zero
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    vand.vv v14, v8, v16
+; RV32V-NEXT:    vand.vx v8, v8, a0
 ; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    vxor.vv v8, v26, v8
-; RV32V-NEXT:    vmul.vv v12, v10, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v12, v10, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v12, v10, v28
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v12, v10, v30
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v12, v10, v6
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v12, v10, v4
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v12, v10, v2
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v12, v10, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v12, v10, v22
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v12, v10, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vmul.vv v10, v10, v18
+; RV32V-NEXT:    vxor.vv v8, v12, v8
+; RV32V-NEXT:    vmul.vv v10, v10, v14
 ; RV32V-NEXT:    vxor.vv v10, v8, v10
 ; RV32V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vnsrl.wx v8, v10, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add sp, sp, a0
-; RV32V-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    addi sp, sp, 352
+; RV32V-NEXT:    vnsrl.wx v8, v10, a1
+; RV32V-NEXT:    addi sp, sp, 16
 ; RV32V-NEXT:    ret
 ;
 ; RV64V-LABEL: clmulh_nxv2i32_vx:
@@ -6317,1127 +4293,609 @@ define <vscale x 2 x i32> @clmulh_nxv2i32_vx(<vscale x 2 x i32> %va, i32 %b) nou
 define <vscale x 4 x i32> @clmulh_nxv4i32_vv(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb) nounwind {
 ; RV32V-LABEL: clmulh_nxv4i32_vv:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -352
-; RV32V-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    sub sp, sp, a0
+; RV32V-NEXT:    addi sp, sp, -16
 ; RV32V-NEXT:    lui a0, 524288
-; RV32V-NEXT:    li s7, 1
-; RV32V-NEXT:    li a4, 2
-; RV32V-NEXT:    li a3, 4
-; RV32V-NEXT:    li a1, 8
-; RV32V-NEXT:    li s3, 16
-; RV32V-NEXT:    li s4, 64
-; RV32V-NEXT:    li s5, 128
-; RV32V-NEXT:    li s6, 256
-; RV32V-NEXT:    li s9, 512
-; RV32V-NEXT:    li s2, 1024
-; RV32V-NEXT:    lui ra, 1
-; RV32V-NEXT:    lui s11, 2
-; RV32V-NEXT:    lui s10, 4
-; RV32V-NEXT:    lui s8, 8
-; RV32V-NEXT:    lui s1, 16
-; RV32V-NEXT:    lui s0, 32
-; RV32V-NEXT:    lui t6, 64
-; RV32V-NEXT:    lui t5, 128
-; RV32V-NEXT:    lui t4, 256
-; RV32V-NEXT:    lui t3, 512
-; RV32V-NEXT:    lui t2, 1024
-; RV32V-NEXT:    lui t1, 2048
-; RV32V-NEXT:    lui t0, 4096
-; RV32V-NEXT:    lui a7, 8192
-; RV32V-NEXT:    lui a6, 16384
-; RV32V-NEXT:    lui a5, 32768
-; RV32V-NEXT:    sw a0, 272(sp)
-; RV32V-NEXT:    sw zero, 276(sp)
-; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw s7, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw a4, 260(sp)
-; RV32V-NEXT:    lui a4, 65536
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw a3, 252(sp)
-; RV32V-NEXT:    lui a3, 131072
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw a1, 244(sp)
-; RV32V-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32V-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32V-NEXT:    vzext.vf2 v12, v8
 ; RV32V-NEXT:    vzext.vf2 v16, v10
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw s3, 236(sp)
-; RV32V-NEXT:    vand.vx v8, v16, s3
-; RV32V-NEXT:    addi s3, sp, 272
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    sw a0, 228(sp)
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s4, 220(sp)
-; RV32V-NEXT:    vand.vx v28, v16, s4
-; RV32V-NEXT:    addi s4, sp, 264
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s5, 212(sp)
-; RV32V-NEXT:    vand.vx v24, v16, s5
-; RV32V-NEXT:    addi s5, sp, 256
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s6, 204(sp)
-; RV32V-NEXT:    vand.vx v20, v16, s6
-; RV32V-NEXT:    addi s6, sp, 248
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s9, 196(sp)
-; RV32V-NEXT:    vand.vi v4, v16, 2
-; RV32V-NEXT:    vand.vi v0, v16, 1
-; RV32V-NEXT:    vmul.vv v4, v12, v4
-; RV32V-NEXT:    vmul.vv v0, v12, v0
-; RV32V-NEXT:    vxor.vi v0, v0, 0
-; RV32V-NEXT:    vxor.vv v4, v0, v4
-; RV32V-NEXT:    vand.vi v0, v16, 4
-; RV32V-NEXT:    vmul.vv v0, v12, v0
-; RV32V-NEXT:    vxor.vv v4, v4, v0
-; RV32V-NEXT:    vand.vi v0, v16, 8
-; RV32V-NEXT:    vmul.vv v0, v12, v0
-; RV32V-NEXT:    vxor.vv v0, v4, v0
-; RV32V-NEXT:    vand.vx v4, v16, s9
-; RV32V-NEXT:    addi s9, sp, 240
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s2, 188(sp)
+; RV32V-NEXT:    li a2, 16
+; RV32V-NEXT:    li a1, 32
+; RV32V-NEXT:    vand.vx v8, v16, a2
+; RV32V-NEXT:    li a2, 64
+; RV32V-NEXT:    vand.vi v20, v16, 2
+; RV32V-NEXT:    vand.vi v24, v16, 1
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v20, v24, v20
+; RV32V-NEXT:    vand.vi v24, v16, 4
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v20, v20, v24
+; RV32V-NEXT:    vand.vi v24, v16, 8
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v20, v20, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    li a2, 128
 ; RV32V-NEXT:    vmul.vv v8, v12, v8
-; RV32V-NEXT:    vxor.vv v8, v0, v8
-; RV32V-NEXT:    vand.vx v0, v16, a0
-; RV32V-NEXT:    vmul.vv v0, v12, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v16, s2
-; RV32V-NEXT:    slli a0, s7, 11
-; RV32V-NEXT:    vmul.vv v28, v12, v28
-; RV32V-NEXT:    vxor.vv v28, v8, v28
-; RV32V-NEXT:    vand.vx v8, v16, ra
-; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw a0, 180(sp)
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw ra, 172(sp)
+; RV32V-NEXT:    vxor.vv v8, v20, v8
+; RV32V-NEXT:    vand.vx v20, v16, a1
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    li a2, 256
 ; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v28, v28, v24
-; RV32V-NEXT:    vand.vx v24, v16, s11
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s11, 164(sp)
-; RV32V-NEXT:    addi s11, sp, 216
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    li a2, 512
 ; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v28, v28, v20
-; RV32V-NEXT:    vand.vx v20, v16, s10
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw s10, 156(sp)
-; RV32V-NEXT:    addi s10, sp, 208
-; RV32V-NEXT:    vmul.vv v4, v12, v4
-; RV32V-NEXT:    vxor.vv v4, v28, v4
-; RV32V-NEXT:    vand.vx v28, v16, s8
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw s8, 148(sp)
-; RV32V-NEXT:    addi s8, sp, 200
-; RV32V-NEXT:    vmul.vv v0, v12, v0
-; RV32V-NEXT:    vxor.vv v4, v4, v0
-; RV32V-NEXT:    vand.vx v0, v16, a0
-; RV32V-NEXT:    addi ra, sp, 192
-; RV32V-NEXT:    vmul.vv v0, v12, v0
-; RV32V-NEXT:    vxor.vv v0, v4, v0
-; RV32V-NEXT:    vand.vx v4, v16, s1
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw s1, 140(sp)
-; RV32V-NEXT:    addi s2, sp, 184
-; RV32V-NEXT:    vmul.vv v8, v12, v8
-; RV32V-NEXT:    vxor.vv v0, v0, v8
-; RV32V-NEXT:    vand.vx v8, v16, s0
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw s0, 132(sp)
-; RV32V-NEXT:    addi s1, sp, 176
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    li a2, 1024
 ; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v0, v0, v24
-; RV32V-NEXT:    vand.vx v24, v16, t6
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t6, 124(sp)
-; RV32V-NEXT:    addi s0, sp, 168
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    lui a2, 1
 ; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v0, v0, v20
-; RV32V-NEXT:    vand.vx v20, v16, t5
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t5, 116(sp)
-; RV32V-NEXT:    addi t6, sp, 160
-; RV32V-NEXT:    vmul.vv v28, v12, v28
-; RV32V-NEXT:    vxor.vv v0, v0, v28
-; RV32V-NEXT:    vand.vx v28, v16, t4
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t4, 108(sp)
-; RV32V-NEXT:    addi t5, sp, 152
-; RV32V-NEXT:    vmul.vv v4, v12, v4
-; RV32V-NEXT:    vxor.vv v0, v0, v4
-; RV32V-NEXT:    vand.vx v4, v16, t3
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw t3, 100(sp)
-; RV32V-NEXT:    addi t4, sp, 144
-; RV32V-NEXT:    vmul.vv v8, v12, v8
-; RV32V-NEXT:    vxor.vv v8, v0, v8
-; RV32V-NEXT:    vand.vx v0, v16, t2
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw t2, 92(sp)
-; RV32V-NEXT:    addi t3, sp, 136
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    li a2, 1
+; RV32V-NEXT:    slli a2, a2, 11
 ; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v24, v8, v24
-; RV32V-NEXT:    vand.vx v8, v16, t1
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw t1, 84(sp)
-; RV32V-NEXT:    addi t2, sp, 128
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    lui a2, 2
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    lui a2, 4
 ; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v24, v24, v20
-; RV32V-NEXT:    vand.vx v20, v16, t0
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw t0, 76(sp)
-; RV32V-NEXT:    addi t1, sp, 120
-; RV32V-NEXT:    vmul.vv v28, v12, v28
-; RV32V-NEXT:    vxor.vv v24, v24, v28
-; RV32V-NEXT:    vand.vx v28, v16, a7
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v28, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a7, 68(sp)
-; RV32V-NEXT:    addi t0, sp, 112
-; RV32V-NEXT:    vmul.vv v28, v12, v4
-; RV32V-NEXT:    vxor.vv v24, v24, v28
-; RV32V-NEXT:    vand.vx v28, v16, a6
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v28, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a6, 60(sp)
-; RV32V-NEXT:    addi a7, sp, 104
-; RV32V-NEXT:    vmul.vv v28, v12, v0
-; RV32V-NEXT:    vxor.vv v28, v24, v28
-; RV32V-NEXT:    vand.vx v24, v16, a5
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a5, 52(sp)
-; RV32V-NEXT:    addi a6, sp, 96
-; RV32V-NEXT:    vmul.vv v8, v12, v8
-; RV32V-NEXT:    vxor.vv v8, v28, v8
-; RV32V-NEXT:    vand.vx v28, v16, a4
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw a4, 44(sp)
-; RV32V-NEXT:    addi a5, sp, 88
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    lui a2, 8
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    lui a2, 16
 ; RV32V-NEXT:    vmul.vv v20, v12, v20
 ; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vand.vx v4, v16, a3
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw a3, 36(sp)
-; RV32V-NEXT:    addi a4, sp, 80
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    lui a2, 262144
-; RV32V-NEXT:    sw a2, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    lui a0, 524288
-; RV32V-NEXT:    sw a0, 20(sp)
-; RV32V-NEXT:    addi a3, sp, 72
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    lui a2, 32
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    lui a2, 64
 ; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v20, v8, v20
-; RV32V-NEXT:    vlse64.v v8, (s3), zero
-; RV32V-NEXT:    addi s3, sp, 64
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v0, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v0, v12, v0
-; RV32V-NEXT:    vxor.vv v0, v20, v0
-; RV32V-NEXT:    vlse64.v v20, (s4), zero
-; RV32V-NEXT:    addi s4, sp, 56
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    lui a2, 128
 ; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v0, v0, v24
-; RV32V-NEXT:    vlse64.v v24, (s5), zero
-; RV32V-NEXT:    addi s5, sp, 48
-; RV32V-NEXT:    vmul.vv v28, v12, v28
-; RV32V-NEXT:    vxor.vv v0, v0, v28
-; RV32V-NEXT:    vlse64.v v28, (s6), zero
-; RV32V-NEXT:    addi s6, sp, 40
-; RV32V-NEXT:    vmul.vv v4, v12, v4
-; RV32V-NEXT:    vxor.vv v4, v0, v4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v4, (s9), zero
-; RV32V-NEXT:    addi s9, sp, 32
-; RV32V-NEXT:    vand.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a1, sp, 24
-; RV32V-NEXT:    addi a0, sp, 16
-; RV32V-NEXT:    addi s7, sp, 232
-; RV32V-NEXT:    vlse64.v v8, (s7), zero
-; RV32V-NEXT:    addi s7, sp, 224
-; RV32V-NEXT:    vlse64.v v20, (s7), zero
-; RV32V-NEXT:    vlse64.v v24, (s11), zero
-; RV32V-NEXT:    vlse64.v v28, (s10), zero
-; RV32V-NEXT:    vand.vv v8, v16, v8
-; RV32V-NEXT:    csrr s7, vlenb
-; RV32V-NEXT:    slli s7, s7, 4
-; RV32V-NEXT:    add s7, sp, s7
-; RV32V-NEXT:    addi s7, s7, 288
-; RV32V-NEXT:    vs4r.v v8, (s7) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v20
-; RV32V-NEXT:    csrr s7, vlenb
-; RV32V-NEXT:    slli s7, s7, 2
-; RV32V-NEXT:    mv s10, s7
-; RV32V-NEXT:    slli s7, s7, 1
-; RV32V-NEXT:    add s10, s10, s7
-; RV32V-NEXT:    slli s7, s7, 2
-; RV32V-NEXT:    add s7, s7, s10
-; RV32V-NEXT:    add s7, sp, s7
-; RV32V-NEXT:    addi s7, s7, 288
-; RV32V-NEXT:    vs4r.v v8, (s7) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v24
-; RV32V-NEXT:    csrr s7, vlenb
-; RV32V-NEXT:    slli s7, s7, 2
-; RV32V-NEXT:    mv s10, s7
-; RV32V-NEXT:    slli s7, s7, 4
-; RV32V-NEXT:    add s7, s7, s10
-; RV32V-NEXT:    add s7, sp, s7
-; RV32V-NEXT:    addi s7, s7, 288
-; RV32V-NEXT:    vs4r.v v8, (s7) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v28
-; RV32V-NEXT:    csrr s7, vlenb
-; RV32V-NEXT:    slli s7, s7, 2
-; RV32V-NEXT:    mv s10, s7
-; RV32V-NEXT:    slli s7, s7, 1
-; RV32V-NEXT:    add s10, s10, s7
-; RV32V-NEXT:    slli s7, s7, 1
-; RV32V-NEXT:    add s10, s10, s7
-; RV32V-NEXT:    slli s7, s7, 2
-; RV32V-NEXT:    add s7, s7, s10
-; RV32V-NEXT:    add s7, sp, s7
-; RV32V-NEXT:    addi s7, s7, 288
-; RV32V-NEXT:    vs4r.v v8, (s7) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v20, (s8), zero
-; RV32V-NEXT:    vlse64.v v24, (ra), zero
-; RV32V-NEXT:    vlse64.v v28, (s2), zero
-; RV32V-NEXT:    vlse64.v v4, (s1), zero
-; RV32V-NEXT:    vand.vv v8, v16, v20
-; RV32V-NEXT:    csrr s1, vlenb
-; RV32V-NEXT:    slli s1, s1, 2
-; RV32V-NEXT:    mv s2, s1
-; RV32V-NEXT:    slli s1, s1, 1
-; RV32V-NEXT:    add s1, s1, s2
-; RV32V-NEXT:    add s1, sp, s1
-; RV32V-NEXT:    addi s1, s1, 288
-; RV32V-NEXT:    vs4r.v v8, (s1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v24
-; RV32V-NEXT:    csrr s1, vlenb
-; RV32V-NEXT:    slli s1, s1, 3
-; RV32V-NEXT:    mv s2, s1
-; RV32V-NEXT:    slli s1, s1, 2
-; RV32V-NEXT:    add s1, s1, s2
-; RV32V-NEXT:    add s1, sp, s1
-; RV32V-NEXT:    addi s1, s1, 288
-; RV32V-NEXT:    vs4r.v v8, (s1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v28
-; RV32V-NEXT:    csrr s1, vlenb
-; RV32V-NEXT:    slli s1, s1, 6
-; RV32V-NEXT:    add s1, sp, s1
-; RV32V-NEXT:    addi s1, s1, 288
-; RV32V-NEXT:    vs4r.v v8, (s1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v4
-; RV32V-NEXT:    csrr s1, vlenb
-; RV32V-NEXT:    slli s1, s1, 3
-; RV32V-NEXT:    mv s2, s1
-; RV32V-NEXT:    slli s1, s1, 1
-; RV32V-NEXT:    add s2, s2, s1
-; RV32V-NEXT:    slli s1, s1, 2
-; RV32V-NEXT:    add s1, s1, s2
-; RV32V-NEXT:    add s1, sp, s1
-; RV32V-NEXT:    addi s1, s1, 288
-; RV32V-NEXT:    vs4r.v v8, (s1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (s0), zero
-; RV32V-NEXT:    vlse64.v v28, (t6), zero
-; RV32V-NEXT:    vlse64.v v4, (t5), zero
-; RV32V-NEXT:    vlse64.v v0, (t4), zero
-; RV32V-NEXT:    vand.vv v8, v16, v24
-; RV32V-NEXT:    csrr t4, vlenb
-; RV32V-NEXT:    slli t4, t4, 3
-; RV32V-NEXT:    add t4, sp, t4
-; RV32V-NEXT:    addi t4, t4, 288
-; RV32V-NEXT:    vs4r.v v8, (t4) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v28
-; RV32V-NEXT:    csrr t4, vlenb
-; RV32V-NEXT:    slli t4, t4, 2
-; RV32V-NEXT:    mv t5, t4
-; RV32V-NEXT:    slli t4, t4, 3
-; RV32V-NEXT:    add t4, t4, t5
-; RV32V-NEXT:    add t4, sp, t4
-; RV32V-NEXT:    addi t4, t4, 288
-; RV32V-NEXT:    vs4r.v v8, (t4) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v4
-; RV32V-NEXT:    csrr t4, vlenb
-; RV32V-NEXT:    slli t4, t4, 2
-; RV32V-NEXT:    mv t5, t4
-; RV32V-NEXT:    slli t4, t4, 1
-; RV32V-NEXT:    add t5, t5, t4
-; RV32V-NEXT:    slli t4, t4, 1
-; RV32V-NEXT:    add t5, t5, t4
-; RV32V-NEXT:    slli t4, t4, 1
-; RV32V-NEXT:    add t4, t4, t5
-; RV32V-NEXT:    add t4, sp, t4
-; RV32V-NEXT:    addi t4, t4, 288
-; RV32V-NEXT:    vs4r.v v8, (t4) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v0
-; RV32V-NEXT:    csrr t4, vlenb
-; RV32V-NEXT:    slli t4, t4, 2
-; RV32V-NEXT:    mv t5, t4
-; RV32V-NEXT:    slli t4, t4, 2
-; RV32V-NEXT:    add t5, t5, t4
-; RV32V-NEXT:    slli t4, t4, 2
-; RV32V-NEXT:    add t4, t4, t5
-; RV32V-NEXT:    add t4, sp, t4
-; RV32V-NEXT:    addi t4, t4, 288
-; RV32V-NEXT:    vs4r.v v8, (t4) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v28, (t3), zero
-; RV32V-NEXT:    vlse64.v v4, (t2), zero
-; RV32V-NEXT:    vlse64.v v0, (t1), zero
-; RV32V-NEXT:    vlse64.v v8, (t0), zero
-; RV32V-NEXT:    vand.vv v20, v16, v28
-; RV32V-NEXT:    csrr t0, vlenb
-; RV32V-NEXT:    slli t0, t0, 2
-; RV32V-NEXT:    add t0, sp, t0
-; RV32V-NEXT:    addi t0, t0, 288
-; RV32V-NEXT:    vs4r.v v20, (t0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v20, v16, v4
-; RV32V-NEXT:    csrr t0, vlenb
-; RV32V-NEXT:    slli t0, t0, 5
-; RV32V-NEXT:    add t0, sp, t0
-; RV32V-NEXT:    addi t0, t0, 288
-; RV32V-NEXT:    vs4r.v v20, (t0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v20, v16, v0
-; RV32V-NEXT:    csrr t0, vlenb
-; RV32V-NEXT:    slli t0, t0, 3
-; RV32V-NEXT:    mv t1, t0
-; RV32V-NEXT:    slli t0, t0, 1
-; RV32V-NEXT:    add t1, t1, t0
-; RV32V-NEXT:    slli t0, t0, 1
-; RV32V-NEXT:    add t0, t0, t1
-; RV32V-NEXT:    add t0, sp, t0
-; RV32V-NEXT:    addi t0, t0, 288
-; RV32V-NEXT:    vs4r.v v20, (t0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v8
-; RV32V-NEXT:    csrr t0, vlenb
-; RV32V-NEXT:    slli t0, t0, 4
-; RV32V-NEXT:    mv t1, t0
-; RV32V-NEXT:    slli t0, t0, 2
-; RV32V-NEXT:    add t0, t0, t1
-; RV32V-NEXT:    add t0, sp, t0
-; RV32V-NEXT:    addi t0, t0, 288
-; RV32V-NEXT:    vs4r.v v8, (t0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (a7), zero
-; RV32V-NEXT:    vlse64.v v0, (a6), zero
-; RV32V-NEXT:    vlse64.v v20, (a5), zero
-; RV32V-NEXT:    vlse64.v v24, (a4), zero
-; RV32V-NEXT:    vand.vv v4, v16, v8
-; RV32V-NEXT:    vand.vv v8, v16, v0
-; RV32V-NEXT:    csrr a4, vlenb
-; RV32V-NEXT:    slli a4, a4, 2
-; RV32V-NEXT:    mv a5, a4
-; RV32V-NEXT:    slli a4, a4, 1
-; RV32V-NEXT:    add a5, a5, a4
-; RV32V-NEXT:    slli a4, a4, 1
-; RV32V-NEXT:    add a4, a4, a5
-; RV32V-NEXT:    add a4, sp, a4
-; RV32V-NEXT:    addi a4, a4, 288
-; RV32V-NEXT:    vs4r.v v8, (a4) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v20
-; RV32V-NEXT:    csrr a4, vlenb
-; RV32V-NEXT:    slli a4, a4, 2
-; RV32V-NEXT:    mv a5, a4
-; RV32V-NEXT:    slli a4, a4, 2
-; RV32V-NEXT:    add a5, a5, a4
-; RV32V-NEXT:    slli a4, a4, 1
-; RV32V-NEXT:    add a4, a4, a5
-; RV32V-NEXT:    add a4, sp, a4
-; RV32V-NEXT:    addi a4, a4, 288
-; RV32V-NEXT:    vs4r.v v8, (a4) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v24
-; RV32V-NEXT:    csrr a4, vlenb
-; RV32V-NEXT:    slli a4, a4, 2
-; RV32V-NEXT:    mv a5, a4
-; RV32V-NEXT:    slli a4, a4, 1
-; RV32V-NEXT:    add a5, a5, a4
-; RV32V-NEXT:    slli a4, a4, 3
-; RV32V-NEXT:    add a4, a4, a5
-; RV32V-NEXT:    add a4, sp, a4
-; RV32V-NEXT:    addi a4, a4, 288
-; RV32V-NEXT:    vs4r.v v8, (a4) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (a3), zero
-; RV32V-NEXT:    vlse64.v v20, (s3), zero
-; RV32V-NEXT:    vlse64.v v24, (s4), zero
-; RV32V-NEXT:    vlse64.v v28, (s5), zero
-; RV32V-NEXT:    vand.vv v0, v16, v8
-; RV32V-NEXT:    vand.vv v8, v16, v20
-; RV32V-NEXT:    csrr a3, vlenb
-; RV32V-NEXT:    slli a3, a3, 3
-; RV32V-NEXT:    mv a4, a3
-; RV32V-NEXT:    slli a3, a3, 1
-; RV32V-NEXT:    add a3, a3, a4
-; RV32V-NEXT:    add a3, sp, a3
-; RV32V-NEXT:    addi a3, a3, 288
-; RV32V-NEXT:    vs4r.v v8, (a3) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v24
-; RV32V-NEXT:    csrr a3, vlenb
-; RV32V-NEXT:    slli a3, a3, 4
-; RV32V-NEXT:    mv a4, a3
-; RV32V-NEXT:    slli a3, a3, 1
-; RV32V-NEXT:    add a3, a3, a4
-; RV32V-NEXT:    add a3, sp, a3
-; RV32V-NEXT:    addi a3, a3, 288
-; RV32V-NEXT:    vs4r.v v8, (a3) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v28
-; RV32V-NEXT:    csrr a3, vlenb
-; RV32V-NEXT:    slli a3, a3, 3
-; RV32V-NEXT:    mv a4, a3
-; RV32V-NEXT:    slli a3, a3, 3
-; RV32V-NEXT:    add a3, a3, a4
-; RV32V-NEXT:    add a3, sp, a3
-; RV32V-NEXT:    addi a3, a3, 288
-; RV32V-NEXT:    vs4r.v v8, (a3) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (s6), zero
-; RV32V-NEXT:    vlse64.v v20, (s9), zero
-; RV32V-NEXT:    vlse64.v v24, (a1), zero
-; RV32V-NEXT:    vlse64.v v28, (a0), zero
-; RV32V-NEXT:    vand.vv v8, v16, v8
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v16, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v16, v24
-; RV32V-NEXT:    vand.vv v20, v16, v28
-; RV32V-NEXT:    vand.vx v8, v16, a2
-; RV32V-NEXT:    vmul.vv v8, v12, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    lui a2, 256
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    lui a2, 512
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    lui a2, 1024
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    lui a2, 2048
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    lui a2, 4096
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    lui a2, 8192
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    lui a2, 16384
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    lui a2, 32768
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    lui a2, 65536
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a2
+; RV32V-NEXT:    lui a2, 131072
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vx v24, v16, a2
+; RV32V-NEXT:    addi a2, sp, 8
+; RV32V-NEXT:    sw a0, 8(sp)
+; RV32V-NEXT:    sw zero, 12(sp)
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vlse64.v v20, (a2), zero
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    vand.vv v20, v16, v20
+; RV32V-NEXT:    vand.vx v16, v16, a0
 ; RV32V-NEXT:    vmul.vv v16, v12, v16
 ; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v12, v12, v20
+; RV32V-NEXT:    vxor.vv v12, v8, v12
+; RV32V-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32V-NEXT:    vnsrl.wx v8, v12, a1
+; RV32V-NEXT:    addi sp, sp, 16
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv4i32_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64V-NEXT:    vzext.vf2 v12, v8
+; RV64V-NEXT:    vzext.vf2 v16, v10
+; RV64V-NEXT:    li a1, 16
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    li a2, 64
+; RV64V-NEXT:    vand.vi v8, v16, 2
+; RV64V-NEXT:    vand.vi v20, v16, 1
+; RV64V-NEXT:    vmul.vv v8, v12, v8
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v20, v8
+; RV64V-NEXT:    vand.vi v20, v16, 4
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vi v20, v16, 8
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a1
+; RV64V-NEXT:    li a1, 128
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a0
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    li a2, 256
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a1
+; RV64V-NEXT:    li a1, 512
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    li a2, 1024
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a1
+; RV64V-NEXT:    li a1, 1
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    slli a2, a1, 11
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 1
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 4
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 8
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 16
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 32
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 64
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 128
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 256
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 512
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 1024
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 2048
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 4096
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 8192
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 16384
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 32768
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 65536
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 131072
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    lui a2, 262144
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    slli a1, a1, 31
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vmul.vv v12, v12, v16
+; RV64V-NEXT:    vxor.vv v12, v8, v12
+; RV64V-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64V-NEXT:    vnsrl.wx v8, v12, a0
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv4i32_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf2 v12, v10
+; RV32ZVBC-NEXT:    vzext.vf2 v16, v8
+; RV32ZVBC-NEXT:    vclmul.vv v12, v16, v12
+; RV32ZVBC-NEXT:    li a0, 32
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wx v8, v12, a0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv4i32_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf2 v12, v10
+; RV64ZVBC-NEXT:    vzext.vf2 v16, v8
+; RV64ZVBC-NEXT:    vclmul.vv v12, v16, v12
+; RV64ZVBC-NEXT:    li a0, 32
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wx v8, v12, a0
+; RV64ZVBC-NEXT:    ret
+  %va.ext = zext <vscale x 4 x i32> %va to <vscale x 4 x i64>
+  %vb.ext = zext <vscale x 4 x i32> %vb to <vscale x 4 x i64>
+  %clmul = call <vscale x 4 x i64> @llvm.clmul.nxv4i64(<vscale x 4 x i64> %va.ext, <vscale x 4 x i64> %vb.ext)
+  %res.ext = lshr <vscale x 4 x i64> %clmul, splat(i64 32)
+  %res = trunc <vscale x 4 x i64> %res.ext to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @clmulh_nxv4i32_vx(<vscale x 4 x i32> %va, i32 %b) nounwind {
+; RV32V-LABEL: clmulh_nxv4i32_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -16
+; RV32V-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32V-NEXT:    vmv.v.x v16, a0
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32V-NEXT:    vzext.vf2 v12, v8
+; RV32V-NEXT:    li a2, 16
+; RV32V-NEXT:    li a1, 32
+; RV32V-NEXT:    vzext.vf2 v8, v16
+; RV32V-NEXT:    vand.vx v16, v8, a2
+; RV32V-NEXT:    li a2, 64
+; RV32V-NEXT:    vand.vi v20, v8, 2
+; RV32V-NEXT:    vand.vi v24, v8, 1
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v20, v24, v20
+; RV32V-NEXT:    vand.vi v24, v8, 4
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v20, v20, v24
+; RV32V-NEXT:    vand.vi v24, v8, 8
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v20, v20, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    li a2, 128
 ; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vmul.vv v16, v12, v4
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vmul.vv v16, v12, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vmul.vv v16, v12, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vxor.vv v16, v20, v16
+; RV32V-NEXT:    vand.vx v20, v8, a1
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    li a2, 256
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    li a2, 512
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    li a2, 1024
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    li a2, 1
+; RV32V-NEXT:    slli a2, a2, 11
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    lui a2, 1
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    lui a2, 2
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    lui a2, 4
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    lui a2, 8
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    lui a2, 16
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    lui a2, 32
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    lui a2, 64
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    lui a2, 128
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    lui a2, 256
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    lui a2, 512
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    lui a2, 1024
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    lui a2, 2048
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    lui a2, 4096
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    lui a2, 8192
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    lui a2, 16384
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    lui a2, 32768
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    lui a2, 65536
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    lui a2, 131072
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    addi a2, sp, 8
+; RV32V-NEXT:    sw a0, 8(sp)
+; RV32V-NEXT:    sw zero, 12(sp)
+; RV32V-NEXT:    vmul.vv v24, v12, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vlse64.v v24, (a2), zero
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    vand.vv v20, v8, v24
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v8, v12, v8
+; RV32V-NEXT:    vxor.vv v8, v16, v8
 ; RV32V-NEXT:    vmul.vv v12, v12, v20
 ; RV32V-NEXT:    vxor.vv v12, v8, v12
 ; RV32V-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vnsrl.wx v8, v12, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add sp, sp, a0
-; RV32V-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    addi sp, sp, 352
+; RV32V-NEXT:    vnsrl.wx v8, v12, a1
+; RV32V-NEXT:    addi sp, sp, 16
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmulh_nxv4i32_vv:
+; RV64V-LABEL: clmulh_nxv4i32_vx:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64V-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV64V-NEXT:    vmv.v.x v16, a0
+; RV64V-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; RV64V-NEXT:    vzext.vf2 v12, v8
-; RV64V-NEXT:    vzext.vf2 v16, v10
 ; RV64V-NEXT:    li a1, 16
 ; RV64V-NEXT:    li a0, 32
 ; RV64V-NEXT:    li a2, 64
-; RV64V-NEXT:    vand.vi v8, v16, 2
-; RV64V-NEXT:    vand.vi v20, v16, 1
-; RV64V-NEXT:    vmul.vv v8, v12, v8
+; RV64V-NEXT:    vzext.vf2 v8, v16
+; RV64V-NEXT:    vand.vi v16, v8, 2
+; RV64V-NEXT:    vand.vi v20, v8, 1
+; RV64V-NEXT:    vmul.vv v16, v12, v16
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v20, v8
-; RV64V-NEXT:    vand.vi v20, v16, 4
+; RV64V-NEXT:    vxor.vv v16, v20, v16
+; RV64V-NEXT:    vand.vi v20, v8, 4
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vi v20, v16, 8
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vi v20, v8, 8
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a1
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a1
 ; RV64V-NEXT:    li a1, 128
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a0
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a0
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    li a2, 256
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a1
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a1
 ; RV64V-NEXT:    li a1, 512
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    li a2, 1024
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a1
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a1
 ; RV64V-NEXT:    li a1, 1
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    slli a2, a1, 11
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 1
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 2
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 4
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 8
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 16
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 32
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 64
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 128
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 256
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 512
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 1024
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 2048
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 4096
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 8192
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 16384
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 32768
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 65536
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 131072
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
 ; RV64V-NEXT:    lui a2, 262144
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vand.vx v20, v16, a2
 ; RV64V-NEXT:    slli a1, a1, 31
 ; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vand.vx v16, v16, a1
-; RV64V-NEXT:    vxor.vv v8, v8, v20
-; RV64V-NEXT:    vmul.vv v12, v12, v16
-; RV64V-NEXT:    vxor.vv v12, v8, v12
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vand.vx v8, v8, a1
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vv v8, v12, v8
+; RV64V-NEXT:    vxor.vv v12, v16, v8
 ; RV64V-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV64V-NEXT:    vnsrl.wx v8, v12, a0
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv4i32_vv:
+; RV32ZVBC-LABEL: clmulh_nxv4i32_vx:
 ; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v12, v10
-; RV32ZVBC-NEXT:    vzext.vf2 v16, v8
-; RV32ZVBC-NEXT:    vclmul.vv v12, v16, v12
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf2 v12, v8
+; RV32ZVBC-NEXT:    vzext.vf2 v8, v16
+; RV32ZVBC-NEXT:    vclmul.vv v12, v12, v8
 ; RV32ZVBC-NEXT:    li a0, 32
 ; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVBC-NEXT:    vnsrl.wx v8, v12, a0
 ; RV32ZVBC-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv4i32_vv:
+; RV64ZVBC-LABEL: clmulh_nxv4i32_vx:
 ; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v12, v10
-; RV64ZVBC-NEXT:    vzext.vf2 v16, v8
-; RV64ZVBC-NEXT:    vclmul.vv v12, v16, v12
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf2 v12, v8
+; RV64ZVBC-NEXT:    vzext.vf2 v8, v16
+; RV64ZVBC-NEXT:    vclmul.vv v12, v12, v8
 ; RV64ZVBC-NEXT:    li a0, 32
 ; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV64ZVBC-NEXT:    vnsrl.wx v8, v12, a0
 ; RV64ZVBC-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i64 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %va.ext = zext <vscale x 4 x i32> %va to <vscale x 4 x i64>
   %vb.ext = zext <vscale x 4 x i32> %vb to <vscale x 4 x i64>
   %clmul = call <vscale x 4 x i64> @llvm.clmul.nxv4i64(<vscale x 4 x i64> %va.ext, <vscale x 4 x i64> %vb.ext)
@@ -7446,2716 +4904,652 @@ define <vscale x 4 x i32> @clmulh_nxv4i32_vv(<vscale x 4 x i32> %va, <vscale x 4
   ret <vscale x 4 x i32> %res
 }
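[A minimal scalar model, not part of this patch, of what the expanded RV32V/RV64V sequences above compute: each bit of the second operand is isolated with vand.vi/vand.vx, multiplied into the zero-extended first operand with vmul.vv (a multiply by a one-bit mask is just a left shift), and the partial products are folded together with vxor.vv. The trailing vnsrl.wx by 32 extracts the high word, which equals clmulh here because the carry-less product of two zero-extended i32 values fits in 63 bits. All names below are illustrative, not from the patch.]

#include <cstdint>

// Low 64 bits of the carry-less (GF(2) polynomial) product, mirroring the
// vand/vmul/vxor chain in the expanded check lines above.
uint64_t clmul64(uint64_t a, uint64_t b) {
  uint64_t acc = 0;
  for (unsigned i = 0; i < 64; ++i) {
    uint64_t bit = b & (UINT64_C(1) << i); // vand.vi / vand.vx step
    acc ^= a * bit; // vmul.vv then vxor.vv: a * bit is a << i or 0
  }
  return acc;
}

// With i32 inputs zero-extended to i64 the product is at most 63 bits wide,
// so the upper 32 bits are exactly clmulh: the vnsrl.wx by 32 at the end.
uint32_t clmulh32(uint32_t a, uint32_t b) {
  return uint32_t(clmul64(a, b) >> 32);
}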
 
-define <vscale x 4 x i32> @clmulh_nxv4i32_vx(<vscale x 4 x i32> %va, i32 %b) nounwind {
-; RV32V-LABEL: clmulh_nxv4i32_vx:
+define <vscale x 8 x i32> @clmulh_nxv8i32_vv(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb) nounwind {
+; RV32V-LABEL: clmulh_nxv8i32_vv:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -352
-; RV32V-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a2, a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a2, a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    sub sp, sp, a1
-; RV32V-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32V-NEXT:    vmv.v.x v16, a0
-; RV32V-NEXT:    lui s11, 524288
-; RV32V-NEXT:    li a0, 1
-; RV32V-NEXT:    li a3, 2
-; RV32V-NEXT:    li a2, 4
-; RV32V-NEXT:    li a1, 8
-; RV32V-NEXT:    li s9, 16
-; RV32V-NEXT:    li ra, 32
-; RV32V-NEXT:    li s10, 64
-; RV32V-NEXT:    li s8, 128
-; RV32V-NEXT:    li s7, 256
-; RV32V-NEXT:    li s6, 512
-; RV32V-NEXT:    li s5, 1024
-; RV32V-NEXT:    lui s4, 1
-; RV32V-NEXT:    lui s3, 2
-; RV32V-NEXT:    lui s2, 4
-; RV32V-NEXT:    lui s1, 8
-; RV32V-NEXT:    lui s0, 16
-; RV32V-NEXT:    lui t6, 32
-; RV32V-NEXT:    lui t5, 64
-; RV32V-NEXT:    lui t4, 128
-; RV32V-NEXT:    lui t3, 256
-; RV32V-NEXT:    lui t2, 512
-; RV32V-NEXT:    lui t1, 1024
-; RV32V-NEXT:    lui t0, 2048
-; RV32V-NEXT:    lui a7, 4096
-; RV32V-NEXT:    lui a6, 8192
-; RV32V-NEXT:    lui a5, 16384
-; RV32V-NEXT:    lui a4, 32768
-; RV32V-NEXT:    sw s11, 272(sp)
-; RV32V-NEXT:    sw zero, 276(sp)
-; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw a0, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw a3, 260(sp)
-; RV32V-NEXT:    lui a3, 65536
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw a2, 252(sp)
-; RV32V-NEXT:    lui a2, 131072
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw a1, 244(sp)
-; RV32V-NEXT:    lui a1, 262144
-; RV32V-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV32V-NEXT:    vzext.vf2 v12, v8
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw s9, 236(sp)
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw ra, 228(sp)
-; RV32V-NEXT:    li ra, 32
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s10, 220(sp)
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s8, 212(sp)
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s7, 204(sp)
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s6, 196(sp)
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s5, 188(sp)
-; RV32V-NEXT:    slli a0, a0, 11
-; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw a0, 180(sp)
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw s4, 172(sp)
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s3, 164(sp)
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw s2, 156(sp)
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw s1, 148(sp)
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw s0, 140(sp)
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw t6, 132(sp)
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t5, 124(sp)
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t4, 116(sp)
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t3, 108(sp)
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw t2, 100(sp)
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw t1, 92(sp)
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw t0, 84(sp)
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a7, 76(sp)
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a6, 68(sp)
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a5, 60(sp)
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a4, 52(sp)
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw a3, 44(sp)
-; RV32V-NEXT:    lui a4, 65536
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw a2, 36(sp)
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    sw a1, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw s11, 20(sp)
-; RV32V-NEXT:    addi s11, sp, 272
-; RV32V-NEXT:    vzext.vf2 v8, v16
-; RV32V-NEXT:    vand.vx v28, v8, s9
-; RV32V-NEXT:    addi s9, sp, 264
-; RV32V-NEXT:    vand.vx v24, v8, s10
-; RV32V-NEXT:    vand.vx v20, v8, s8
-; RV32V-NEXT:    addi s8, sp, 248
-; RV32V-NEXT:    vand.vx v16, v8, s7
-; RV32V-NEXT:    addi s7, sp, 240
-; RV32V-NEXT:    vand.vi v4, v8, 2
-; RV32V-NEXT:    vand.vi v0, v8, 1
-; RV32V-NEXT:    vmul.vv v4, v12, v4
-; RV32V-NEXT:    vmul.vv v0, v12, v0
-; RV32V-NEXT:    vxor.vi v0, v0, 0
-; RV32V-NEXT:    vxor.vv v4, v0, v4
-; RV32V-NEXT:    vand.vi v0, v8, 4
-; RV32V-NEXT:    vmul.vv v0, v12, v0
-; RV32V-NEXT:    vxor.vv v4, v4, v0
-; RV32V-NEXT:    vand.vi v0, v8, 8
-; RV32V-NEXT:    vmul.vv v0, v12, v0
-; RV32V-NEXT:    vxor.vv v0, v4, v0
-; RV32V-NEXT:    vand.vx v4, v8, s6
-; RV32V-NEXT:    addi s6, sp, 232
-; RV32V-NEXT:    vmul.vv v28, v12, v28
-; RV32V-NEXT:    vxor.vv v28, v0, v28
-; RV32V-NEXT:    vand.vx v0, v8, ra
-; RV32V-NEXT:    vmul.vv v0, v12, v0
-; RV32V-NEXT:    vxor.vv v0, v28, v0
-; RV32V-NEXT:    vand.vx v28, v8, s5
-; RV32V-NEXT:    addi s5, sp, 224
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v0, v0, v24
-; RV32V-NEXT:    vand.vx v24, v8, a0
-; RV32V-NEXT:    addi s10, sp, 216
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v0, v0, v20
-; RV32V-NEXT:    vand.vx v20, v8, s4
-; RV32V-NEXT:    addi s4, sp, 208
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v0, v0, v16
-; RV32V-NEXT:    vand.vx v16, v8, s3
-; RV32V-NEXT:    addi s3, sp, 200
-; RV32V-NEXT:    vmul.vv v4, v12, v4
-; RV32V-NEXT:    vxor.vv v0, v0, v4
-; RV32V-NEXT:    vand.vx v4, v8, s2
-; RV32V-NEXT:    addi s2, sp, 192
-; RV32V-NEXT:    vmul.vv v28, v12, v28
-; RV32V-NEXT:    vxor.vv v0, v0, v28
-; RV32V-NEXT:    vand.vx v28, v8, s1
-; RV32V-NEXT:    addi a1, sp, 184
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v0, v0, v24
-; RV32V-NEXT:    vand.vx v24, v8, s0
-; RV32V-NEXT:    addi s0, sp, 176
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v0, v0, v20
-; RV32V-NEXT:    vand.vx v20, v8, t6
-; RV32V-NEXT:    addi t6, sp, 168
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v0, v0, v16
-; RV32V-NEXT:    vand.vx v16, v8, t5
-; RV32V-NEXT:    addi t5, sp, 160
-; RV32V-NEXT:    vmul.vv v4, v12, v4
-; RV32V-NEXT:    vxor.vv v0, v0, v4
-; RV32V-NEXT:    vand.vx v4, v8, t4
-; RV32V-NEXT:    addi t4, sp, 152
-; RV32V-NEXT:    vmul.vv v28, v12, v28
-; RV32V-NEXT:    vxor.vv v0, v0, v28
-; RV32V-NEXT:    vand.vx v28, v8, t3
-; RV32V-NEXT:    addi t3, sp, 144
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v0, v0, v24
-; RV32V-NEXT:    vand.vx v24, v8, t2
-; RV32V-NEXT:    addi t2, sp, 136
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v0, v0, v20
-; RV32V-NEXT:    vand.vx v20, v8, t1
-; RV32V-NEXT:    addi t1, sp, 128
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v0, v0, v16
-; RV32V-NEXT:    vand.vx v16, v8, t0
-; RV32V-NEXT:    addi t0, sp, 120
-; RV32V-NEXT:    vmul.vv v4, v12, v4
-; RV32V-NEXT:    vxor.vv v0, v0, v4
-; RV32V-NEXT:    vand.vx v4, v8, a7
-; RV32V-NEXT:    addi a7, sp, 112
-; RV32V-NEXT:    vmul.vv v28, v12, v28
-; RV32V-NEXT:    vxor.vv v0, v0, v28
-; RV32V-NEXT:    vand.vx v28, v8, a6
-; RV32V-NEXT:    addi a6, sp, 104
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v0, v0, v24
-; RV32V-NEXT:    vand.vx v24, v8, a5
-; RV32V-NEXT:    addi a5, sp, 96
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v0, v0, v20
-; RV32V-NEXT:    lui a0, 32768
-; RV32V-NEXT:    vand.vx v20, v8, a0
-; RV32V-NEXT:    addi a3, sp, 88
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v0, v0, v16
-; RV32V-NEXT:    vand.vx v16, v8, a4
-; RV32V-NEXT:    addi a4, sp, 80
-; RV32V-NEXT:    vmul.vv v4, v12, v4
-; RV32V-NEXT:    vxor.vv v0, v0, v4
-; RV32V-NEXT:    vand.vx v4, v8, a2
-; RV32V-NEXT:    addi a2, sp, 72
-; RV32V-NEXT:    vmul.vv v28, v12, v28
-; RV32V-NEXT:    vxor.vv v0, v0, v28
-; RV32V-NEXT:    vlse64.v v28, (s11), zero
-; RV32V-NEXT:    addi s1, sp, 64
-; RV32V-NEXT:    vmul.vv v24, v12, v24
-; RV32V-NEXT:    vxor.vv v0, v0, v24
-; RV32V-NEXT:    vlse64.v v24, (s9), zero
-; RV32V-NEXT:    addi s11, sp, 56
-; RV32V-NEXT:    vmul.vv v20, v12, v20
-; RV32V-NEXT:    vxor.vv v0, v0, v20
-; RV32V-NEXT:    addi a0, sp, 256
-; RV32V-NEXT:    vlse64.v v20, (a0), zero
-; RV32V-NEXT:    addi s9, sp, 48
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v0, v0, v16
-; RV32V-NEXT:    vlse64.v v16, (s8), zero
-; RV32V-NEXT:    addi s8, sp, 40
-; RV32V-NEXT:    vmul.vv v4, v12, v4
-; RV32V-NEXT:    vxor.vv v4, v0, v4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv ra, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add ra, ra, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add ra, ra, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, ra
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v4, (s7), zero
-; RV32V-NEXT:    addi s7, sp, 32
-; RV32V-NEXT:    vand.vv v28, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv ra, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, ra
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v28, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv ra, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add ra, ra, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, ra
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v24, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v20, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv ra, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add ra, ra, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, ra
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v20, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v16, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv ra, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add ra, ra, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add ra, ra, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, ra
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v16, v8, v4
+; RV32V-NEXT:    addi sp, sp, -16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv ra, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add ra, ra, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, ra
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi ra, sp, 24
+; RV32V-NEXT:    sub sp, sp, a0
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32V-NEXT:    vzext.vf2 v16, v8
+; RV32V-NEXT:    vzext.vf2 v24, v12
+; RV32V-NEXT:    li a2, 16
+; RV32V-NEXT:    li a1, 32
+; RV32V-NEXT:    vand.vi v8, v24, 2
+; RV32V-NEXT:    vand.vi v0, v24, 1
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v0, v8
+; RV32V-NEXT:    vand.vi v0, v24, 4
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vi v0, v24, 8
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    li a2, 64
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a1
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    li a2, 128
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    li a2, 256
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    li a2, 512
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    li a2, 1024
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    li a2, 1
+; RV32V-NEXT:    slli a2, a2, 11
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 1
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 2
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 4
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 8
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 16
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 32
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 64
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 128
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 256
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 512
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 1024
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 2048
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 4096
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 8192
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 16384
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 32768
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 65536
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    lui a2, 131072
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    csrr a3, vlenb
+; RV32V-NEXT:    slli a3, a3, 3
+; RV32V-NEXT:    add a3, sp, a3
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v0, v24, a2
+; RV32V-NEXT:    addi a2, sp, 8
+; RV32V-NEXT:    sw a0, 8(sp)
+; RV32V-NEXT:    sw zero, 12(sp)
+; RV32V-NEXT:    vlse64.v v8, (a2), zero
 ; RV32V-NEXT:    addi a0, sp, 16
-; RV32V-NEXT:    vlse64.v v16, (s6), zero
-; RV32V-NEXT:    vlse64.v v20, (s5), zero
-; RV32V-NEXT:    vlse64.v v24, (s10), zero
-; RV32V-NEXT:    vlse64.v v28, (s4), zero
-; RV32V-NEXT:    vand.vv v16, v8, v16
-; RV32V-NEXT:    csrr s4, vlenb
-; RV32V-NEXT:    slli s4, s4, 4
-; RV32V-NEXT:    add s4, sp, s4
-; RV32V-NEXT:    addi s4, s4, 288
-; RV32V-NEXT:    vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v16, v8, v20
-; RV32V-NEXT:    csrr s4, vlenb
-; RV32V-NEXT:    slli s4, s4, 2
-; RV32V-NEXT:    mv s5, s4
-; RV32V-NEXT:    slli s4, s4, 1
-; RV32V-NEXT:    add s5, s5, s4
-; RV32V-NEXT:    slli s4, s4, 2
-; RV32V-NEXT:    add s4, s4, s5
-; RV32V-NEXT:    add s4, sp, s4
-; RV32V-NEXT:    addi s4, s4, 288
-; RV32V-NEXT:    vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v16, v8, v24
-; RV32V-NEXT:    csrr s4, vlenb
-; RV32V-NEXT:    slli s4, s4, 2
-; RV32V-NEXT:    mv s5, s4
-; RV32V-NEXT:    slli s4, s4, 4
-; RV32V-NEXT:    add s4, s4, s5
-; RV32V-NEXT:    add s4, sp, s4
-; RV32V-NEXT:    addi s4, s4, 288
-; RV32V-NEXT:    vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v16, v8, v28
-; RV32V-NEXT:    csrr s4, vlenb
-; RV32V-NEXT:    slli s4, s4, 2
-; RV32V-NEXT:    mv s5, s4
-; RV32V-NEXT:    slli s4, s4, 1
-; RV32V-NEXT:    add s5, s5, s4
-; RV32V-NEXT:    slli s4, s4, 1
-; RV32V-NEXT:    add s5, s5, s4
-; RV32V-NEXT:    slli s4, s4, 2
-; RV32V-NEXT:    add s4, s4, s5
-; RV32V-NEXT:    add s4, sp, s4
-; RV32V-NEXT:    addi s4, s4, 288
-; RV32V-NEXT:    vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v20, (s3), zero
-; RV32V-NEXT:    vlse64.v v24, (s2), zero
-; RV32V-NEXT:    vlse64.v v28, (a1), zero
-; RV32V-NEXT:    vlse64.v v4, (s0), zero
-; RV32V-NEXT:    vand.vv v16, v8, v20
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    mv s0, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, s0
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v16, v8, v24
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    mv s0, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, a1, s0
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v16, v8, v28
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 6
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v16, v8, v4
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    mv s0, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add s0, s0, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, a1, s0
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (t6), zero
-; RV32V-NEXT:    vlse64.v v28, (t5), zero
-; RV32V-NEXT:    vlse64.v v4, (t4), zero
-; RV32V-NEXT:    vlse64.v v0, (t3), zero
-; RV32V-NEXT:    vand.vv v16, v8, v24
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v16, v8, v28
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    mv t3, a1
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    add a1, a1, t3
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v16, v8, v4
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    mv t3, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add t3, t3, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add t3, t3, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, t3
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v16, v8, v0
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    mv t3, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add t3, t3, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, a1, t3
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v28, (t2), zero
-; RV32V-NEXT:    vlse64.v v4, (t1), zero
-; RV32V-NEXT:    vlse64.v v0, (t0), zero
-; RV32V-NEXT:    vlse64.v v16, (a7), zero
-; RV32V-NEXT:    vand.vv v20, v8, v28
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs4r.v v20, (a1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v20, v8, v4
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 5
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs4r.v v20, (a1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v20, v8, v0
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    mv a7, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a7, a7, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a7
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs4r.v v20, (a1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v16, v8, v16
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 4
-; RV32V-NEXT:    mv a7, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, a1, a7
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v16, (a6), zero
-; RV32V-NEXT:    vlse64.v v0, (a5), zero
-; RV32V-NEXT:    vlse64.v v20, (a3), zero
-; RV32V-NEXT:    vlse64.v v24, (a4), zero
-; RV32V-NEXT:    vand.vv v4, v8, v16
-; RV32V-NEXT:    vand.vv v16, v8, v0
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    mv a3, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a3, a3, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a3
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v16, v8, v20
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    mv a3, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a3, a3, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a3
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v16, v8, v24
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    mv a3, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a3, a3, a1
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    add a1, a1, a3
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v16, (a2), zero
-; RV32V-NEXT:    vlse64.v v20, (s1), zero
-; RV32V-NEXT:    vlse64.v v24, (s11), zero
-; RV32V-NEXT:    vlse64.v v28, (s9), zero
-; RV32V-NEXT:    vand.vv v0, v8, v16
-; RV32V-NEXT:    vand.vv v16, v8, v20
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v16, v8, v24
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 4
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v16, v8, v28
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v16, (s8), zero
-; RV32V-NEXT:    vlse64.v v20, (s7), zero
-; RV32V-NEXT:    vlse64.v v24, (ra), zero
-; RV32V-NEXT:    vlse64.v v28, (a0), zero
-; RV32V-NEXT:    vand.vv v16, v8, v16
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v16, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    vand.vv v20, v8, v28
-; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    vand.vx v8, v8, a0
-; RV32V-NEXT:    vmul.vv v8, v12, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vmul.vv v0, v16, v0
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    addi a0, a0, 16
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v0, v8, v0
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    addi a2, sp, 16
+; RV32V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v24, v8
+; RV32V-NEXT:    vand.vx v24, v24, a0
+; RV32V-NEXT:    vmul.vv v24, v16, v24
+; RV32V-NEXT:    vxor.vv v24, v0, v24
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vxor.vv v16, v24, v8
+; RV32V-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32V-NEXT:    vnsrl.wx v8, v16, a1
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vmul.vv v16, v12, v4
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vmul.vv v16, v12, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v12, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vmul.vv v16, v12, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vmul.vv v12, v12, v20
-; RV32V-NEXT:    vxor.vv v12, v8, v12
-; RV32V-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vnsrl.wx v8, v12, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add sp, sp, a0
-; RV32V-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    addi sp, sp, 352
+; RV32V-NEXT:    addi sp, sp, 16
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmulh_nxv4i32_vx:
+; RV64V-LABEL: clmulh_nxv8i32_vv:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV64V-NEXT:    vmv.v.x v16, a0
-; RV64V-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV64V-NEXT:    vzext.vf2 v12, v8
+; RV64V-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64V-NEXT:    vzext.vf2 v16, v8
+; RV64V-NEXT:    vzext.vf2 v24, v12
 ; RV64V-NEXT:    li a1, 16
 ; RV64V-NEXT:    li a0, 32
 ; RV64V-NEXT:    li a2, 64
-; RV64V-NEXT:    vzext.vf2 v8, v16
-; RV64V-NEXT:    vand.vi v16, v8, 2
-; RV64V-NEXT:    vand.vi v20, v8, 1
-; RV64V-NEXT:    vmul.vv v16, v12, v16
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v20, v16
-; RV64V-NEXT:    vand.vi v20, v8, 4
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vi v20, v8, 8
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a1
+; RV64V-NEXT:    vand.vi v8, v24, 2
+; RV64V-NEXT:    vand.vi v0, v24, 1
+; RV64V-NEXT:    vmul.vv v8, v16, v8
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v0, v8
+; RV64V-NEXT:    vand.vi v0, v24, 4
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vi v0, v24, 8
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a1
 ; RV64V-NEXT:    li a1, 128
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a0
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a0
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    li a2, 256
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a1
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a1
 ; RV64V-NEXT:    li a1, 512
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    li a2, 1024
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a1
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a1
 ; RV64V-NEXT:    li a1, 1
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    slli a2, a1, 11
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 1
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 2
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 4
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 8
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 16
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 32
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 64
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 128
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 256
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 512
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 1024
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 2048
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 4096
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 8192
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 16384
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 32768
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 65536
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 131072
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    lui a2, 262144
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    slli a1, a1, 31
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v20, v8, a2
-; RV64V-NEXT:    vand.vx v8, v8, a1
-; RV64V-NEXT:    vmul.vv v20, v12, v20
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vmul.vv v8, v12, v8
-; RV64V-NEXT:    vxor.vv v12, v16, v8
-; RV64V-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64V-NEXT:    vnsrl.wx v8, v12, a0
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vand.vx v24, v24, a1
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    vmul.vv v16, v16, v24
+; RV64V-NEXT:    vxor.vv v16, v8, v16
+; RV64V-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64V-NEXT:    vnsrl.wx v8, v16, a0
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv4i32_vx:
+; RV32ZVBC-LABEL: clmulh_nxv8i32_vv:
 ; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v16, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v12, v8
-; RV32ZVBC-NEXT:    vzext.vf2 v8, v16
-; RV32ZVBC-NEXT:    vclmul.vv v12, v12, v8
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf2 v16, v12
+; RV32ZVBC-NEXT:    vzext.vf2 v24, v8
+; RV32ZVBC-NEXT:    vclmul.vv v16, v24, v16
 ; RV32ZVBC-NEXT:    li a0, 32
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wx v8, v12, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wx v8, v16, a0
 ; RV32ZVBC-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv4i32_vx:
+; RV64ZVBC-LABEL: clmulh_nxv8i32_vv:
 ; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v16, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v12, v8
-; RV64ZVBC-NEXT:    vzext.vf2 v8, v16
-; RV64ZVBC-NEXT:    vclmul.vv v12, v12, v8
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf2 v16, v12
+; RV64ZVBC-NEXT:    vzext.vf2 v24, v8
+; RV64ZVBC-NEXT:    vclmul.vv v16, v24, v16
 ; RV64ZVBC-NEXT:    li a0, 32
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wx v8, v12, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wx v8, v16, a0
 ; RV64ZVBC-NEXT:    ret
-  %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i64 0
-  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-  %va.ext = zext <vscale x 4 x i32> %va to <vscale x 4 x i64>
-  %vb.ext = zext <vscale x 4 x i32> %vb to <vscale x 4 x i64>
-  %clmul = call <vscale x 4 x i64> @llvm.clmul.nxv4i64(<vscale x 4 x i64> %va.ext, <vscale x 4 x i64> %vb.ext)
-  %res.ext = lshr <vscale x 4 x i64> %clmul, splat(i64 32)
-  %res = trunc <vscale x 4 x i64> %res.ext to <vscale x 4 x i32>
-  ret <vscale x 4 x i32> %res
+  %va.ext = zext <vscale x 8 x i32> %va to <vscale x 8 x i64>
+  %vb.ext = zext <vscale x 8 x i32> %vb to <vscale x 8 x i64>
+  %clmul = call <vscale x 8 x i64> @llvm.clmul.nxv8i64(<vscale x 8 x i64> %va.ext, <vscale x 8 x i64> %vb.ext)
+  %res.ext = lshr <vscale x 8 x i64> %clmul, splat(i64 32)
+  %res = trunc <vscale x 8 x i64> %res.ext to <vscale x 8 x i32>
+  ret <vscale x 8 x i32> %res
 }
 
-define <vscale x 8 x i32> @clmulh_nxv8i32_vv(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb) nounwind {
-; RV32V-LABEL: clmulh_nxv8i32_vv:
+define <vscale x 8 x i32> @clmulh_nxv8i32_vx(<vscale x 8 x i32> %va, i32 %b) nounwind {
+; RV32V-LABEL: clmulh_nxv8i32_vx:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -352
-; RV32V-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    sub sp, sp, a0
-; RV32V-NEXT:    lui a2, 524288
-; RV32V-NEXT:    li s4, 1
-; RV32V-NEXT:    li a4, 2
-; RV32V-NEXT:    li a3, 4
-; RV32V-NEXT:    li a1, 8
-; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    li s1, 64
-; RV32V-NEXT:    li s3, 128
-; RV32V-NEXT:    li s7, 256
-; RV32V-NEXT:    li s10, 512
-; RV32V-NEXT:    li s6, 1024
-; RV32V-NEXT:    lui ra, 1
-; RV32V-NEXT:    lui s11, 2
-; RV32V-NEXT:    lui s8, 4
-; RV32V-NEXT:    lui s5, 8
-; RV32V-NEXT:    lui s2, 16
-; RV32V-NEXT:    lui s0, 32
-; RV32V-NEXT:    lui t6, 64
-; RV32V-NEXT:    lui t5, 128
-; RV32V-NEXT:    lui t4, 256
-; RV32V-NEXT:    lui t3, 512
-; RV32V-NEXT:    lui t2, 1024
-; RV32V-NEXT:    lui t1, 2048
-; RV32V-NEXT:    lui t0, 4096
-; RV32V-NEXT:    lui a7, 8192
-; RV32V-NEXT:    lui a6, 16384
-; RV32V-NEXT:    lui a5, 32768
-; RV32V-NEXT:    sw a2, 272(sp)
-; RV32V-NEXT:    sw zero, 276(sp)
-; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw s4, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw a4, 260(sp)
-; RV32V-NEXT:    lui a4, 65536
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw a3, 252(sp)
-; RV32V-NEXT:    lui a3, 131072
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw a1, 244(sp)
-; RV32V-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32V-NEXT:    addi sp, sp, -16
+; RV32V-NEXT:    csrr a1, vlenb
+; RV32V-NEXT:    slli a1, a1, 4
+; RV32V-NEXT:    sub sp, sp, a1
+; RV32V-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32V-NEXT:    vmv.v.x v24, a0
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV32V-NEXT:    vzext.vf2 v16, v8
-; RV32V-NEXT:    vzext.vf2 v24, v12
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw a0, 236(sp)
-; RV32V-NEXT:    vand.vi v8, v24, 2
-; RV32V-NEXT:    vand.vi v0, v24, 1
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    li a2, 16
+; RV32V-NEXT:    li a1, 32
+; RV32V-NEXT:    vzext.vf2 v8, v24
+; RV32V-NEXT:    vand.vi v24, v8, 2
+; RV32V-NEXT:    vand.vi v0, v8, 1
+; RV32V-NEXT:    vmul.vv v24, v16, v24
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vi v0, v0, 0
-; RV32V-NEXT:    vxor.vv v8, v0, v8
-; RV32V-NEXT:    vand.vi v0, v24, 4
+; RV32V-NEXT:    vxor.vv v24, v0, v24
+; RV32V-NEXT:    vand.vi v0, v8, 4
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vi v0, v24, 8
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vi v0, v8, 8
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a0
-; RV32V-NEXT:    addi s9, sp, 272
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    sw a0, 228(sp)
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s1, 220(sp)
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a0
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    li a2, 64
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, s1
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s3, 212(sp)
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a1
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, s3
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s7, 204(sp)
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    li a2, 128
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, s7
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s10, 196(sp)
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    li a2, 256
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, s10
-; RV32V-NEXT:    addi s10, sp, 240
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s6, 188(sp)
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    li a2, 512
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, s6
-; RV32V-NEXT:    slli s4, s4, 11
-; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw s4, 180(sp)
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    li a2, 1024
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, s4
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    li a2, 1
+; RV32V-NEXT:    slli a2, a2, 11
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, ra
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw ra, 172(sp)
-; RV32V-NEXT:    addi ra, sp, 216
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 1
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, s11
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s11, 164(sp)
-; RV32V-NEXT:    addi s11, sp, 208
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 2
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, s8
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw s8, 156(sp)
-; RV32V-NEXT:    addi s8, sp, 200
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 4
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, s5
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw s5, 148(sp)
-; RV32V-NEXT:    addi s5, sp, 192
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 8
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, s2
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw s2, 140(sp)
-; RV32V-NEXT:    addi s4, sp, 184
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 16
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, s0
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw s0, 132(sp)
-; RV32V-NEXT:    addi s2, sp, 176
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 32
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, t6
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t6, 124(sp)
-; RV32V-NEXT:    addi s3, sp, 168
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 64
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, t5
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t5, 116(sp)
-; RV32V-NEXT:    addi s1, sp, 160
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 128
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, t4
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t4, 108(sp)
-; RV32V-NEXT:    addi s0, sp, 152
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 256
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, t3
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw t3, 100(sp)
-; RV32V-NEXT:    addi t5, sp, 144
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 512
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, t2
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw t2, 92(sp)
-; RV32V-NEXT:    addi t6, sp, 136
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 1024
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, t1
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw t1, 84(sp)
-; RV32V-NEXT:    addi t4, sp, 128
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 2048
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, t0
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw t0, 76(sp)
-; RV32V-NEXT:    addi t3, sp, 120
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 4096
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a7
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a7, 68(sp)
-; RV32V-NEXT:    addi t1, sp, 112
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 8192
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a6
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a6, 60(sp)
-; RV32V-NEXT:    addi t2, sp, 104
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 16384
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a5
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a5, 52(sp)
-; RV32V-NEXT:    addi t0, sp, 96
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 32768
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a4
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw a4, 44(sp)
-; RV32V-NEXT:    addi a7, sp, 88
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 65536
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vand.vx v0, v24, a3
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw a3, 36(sp)
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    sw a0, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw a2, 20(sp)
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    lui a2, 131072
 ; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    sw t1, 4(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 5
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (s9), zero
-; RV32V-NEXT:    addi a3, sp, 80
-; RV32V-NEXT:    addi a6, sp, 72
-; RV32V-NEXT:    addi a5, sp, 64
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 8
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a2, sp, 56
-; RV32V-NEXT:    addi a1, sp, 48
-; RV32V-NEXT:    addi s9, sp, 40
-; RV32V-NEXT:    addi a4, sp, 32
-; RV32V-NEXT:    addi s7, sp, 264
-; RV32V-NEXT:    vlse64.v v8, (s7), zero
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 4
-; RV32V-NEXT:    mv s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t1, t1, s6
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vs8r.v v8, (t1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi s7, sp, 256
-; RV32V-NEXT:    vlse64.v v0, (s7), zero
-; RV32V-NEXT:    addi s6, sp, 248
-; RV32V-NEXT:    vlse64.v v8, (s6), zero
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 3
-; RV32V-NEXT:    mv s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t1, t1, s6
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vs8r.v v8, (t1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (s10), zero
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 3
-; RV32V-NEXT:    mv s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 2
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t1, t1, s6
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vs8r.v v8, (t1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 4
-; RV32V-NEXT:    mv s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t1, t1, s6
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vl8r.v v8, (t1) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 5
-; RV32V-NEXT:    mv s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t1, t1, s6
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vs8r.v v8, (t1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v24, v0
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 3
-; RV32V-NEXT:    mv s6, t1
-; RV32V-NEXT:    slli t1, t1, 2
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t1, t1, s6
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vs8r.v v8, (t1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 3
-; RV32V-NEXT:    mv s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t1, t1, s6
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vl8r.v v8, (t1) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 4
-; RV32V-NEXT:    mv s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t1, t1, s6
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vs8r.v v8, (t1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 3
-; RV32V-NEXT:    mv s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 2
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t1, t1, s6
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vl8r.v v8, (t1) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 3
-; RV32V-NEXT:    mv s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t1, t1, s6
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vs8r.v v8, (t1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi s7, sp, 24
-; RV32V-NEXT:    addi s10, sp, 16
-; RV32V-NEXT:    addi s6, sp, 232
-; RV32V-NEXT:    vlse64.v v8, (s6), zero
-; RV32V-NEXT:    csrr s6, vlenb
-; RV32V-NEXT:    slli s6, s6, 4
-; RV32V-NEXT:    mv t1, s6
-; RV32V-NEXT:    slli s6, s6, 2
-; RV32V-NEXT:    add t1, t1, s6
-; RV32V-NEXT:    slli s6, s6, 1
-; RV32V-NEXT:    add s6, s6, t1
-; RV32V-NEXT:    lw t1, 4(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    add s6, sp, s6
-; RV32V-NEXT:    addi s6, s6, 288
-; RV32V-NEXT:    vs8r.v v8, (s6) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi s6, sp, 224
-; RV32V-NEXT:    vlse64.v v0, (s6), zero
-; RV32V-NEXT:    vlse64.v v8, (ra), zero
-; RV32V-NEXT:    csrr s6, vlenb
-; RV32V-NEXT:    slli s6, s6, 3
-; RV32V-NEXT:    mv ra, s6
-; RV32V-NEXT:    slli s6, s6, 1
-; RV32V-NEXT:    add ra, ra, s6
-; RV32V-NEXT:    slli s6, s6, 2
-; RV32V-NEXT:    add ra, ra, s6
-; RV32V-NEXT:    slli s6, s6, 1
-; RV32V-NEXT:    add s6, s6, ra
-; RV32V-NEXT:    add s6, sp, s6
-; RV32V-NEXT:    addi s6, s6, 288
-; RV32V-NEXT:    vs8r.v v8, (s6) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (s11), zero
-; RV32V-NEXT:    csrr s6, vlenb
-; RV32V-NEXT:    slli s6, s6, 3
-; RV32V-NEXT:    mv s11, s6
-; RV32V-NEXT:    slli s6, s6, 3
-; RV32V-NEXT:    add s11, s11, s6
-; RV32V-NEXT:    slli s6, s6, 1
-; RV32V-NEXT:    add s6, s6, s11
-; RV32V-NEXT:    add s6, sp, s6
-; RV32V-NEXT:    addi s6, s6, 288
-; RV32V-NEXT:    vs8r.v v8, (s6) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr s6, vlenb
-; RV32V-NEXT:    slli s6, s6, 4
-; RV32V-NEXT:    mv s11, s6
-; RV32V-NEXT:    slli s6, s6, 2
-; RV32V-NEXT:    add s11, s11, s6
-; RV32V-NEXT:    slli s6, s6, 1
-; RV32V-NEXT:    add s6, s6, s11
-; RV32V-NEXT:    add s6, sp, s6
-; RV32V-NEXT:    addi s6, s6, 288
-; RV32V-NEXT:    vl8r.v v8, (s6) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr s6, vlenb
-; RV32V-NEXT:    slli s6, s6, 3
-; RV32V-NEXT:    mv s11, s6
-; RV32V-NEXT:    slli s6, s6, 1
-; RV32V-NEXT:    add s11, s11, s6
-; RV32V-NEXT:    slli s6, s6, 1
-; RV32V-NEXT:    add s6, s6, s11
-; RV32V-NEXT:    add s6, sp, s6
-; RV32V-NEXT:    addi s6, s6, 288
-; RV32V-NEXT:    vs8r.v v8, (s6) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v24, v0
-; RV32V-NEXT:    csrr s6, vlenb
-; RV32V-NEXT:    slli s6, s6, 4
-; RV32V-NEXT:    mv s11, s6
-; RV32V-NEXT:    slli s6, s6, 1
-; RV32V-NEXT:    add s11, s11, s6
-; RV32V-NEXT:    slli s6, s6, 1
-; RV32V-NEXT:    add s6, s6, s11
-; RV32V-NEXT:    add s6, sp, s6
-; RV32V-NEXT:    addi s6, s6, 288
-; RV32V-NEXT:    vs8r.v v8, (s6) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr s6, vlenb
-; RV32V-NEXT:    slli s6, s6, 3
-; RV32V-NEXT:    mv s11, s6
-; RV32V-NEXT:    slli s6, s6, 1
-; RV32V-NEXT:    add s11, s11, s6
-; RV32V-NEXT:    slli s6, s6, 2
-; RV32V-NEXT:    add s11, s11, s6
-; RV32V-NEXT:    slli s6, s6, 1
-; RV32V-NEXT:    add s6, s6, s11
-; RV32V-NEXT:    add s6, sp, s6
-; RV32V-NEXT:    addi s6, s6, 288
-; RV32V-NEXT:    vl8r.v v8, (s6) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr s6, vlenb
-; RV32V-NEXT:    slli s6, s6, 3
-; RV32V-NEXT:    mv s11, s6
-; RV32V-NEXT:    slli s6, s6, 2
-; RV32V-NEXT:    add s11, s11, s6
-; RV32V-NEXT:    slli s6, s6, 2
-; RV32V-NEXT:    add s6, s6, s11
-; RV32V-NEXT:    add s6, sp, s6
-; RV32V-NEXT:    addi s6, s6, 288
-; RV32V-NEXT:    vs8r.v v8, (s6) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr s6, vlenb
-; RV32V-NEXT:    slli s6, s6, 3
-; RV32V-NEXT:    mv s11, s6
-; RV32V-NEXT:    slli s6, s6, 3
-; RV32V-NEXT:    add s11, s11, s6
-; RV32V-NEXT:    slli s6, s6, 1
-; RV32V-NEXT:    add s6, s6, s11
-; RV32V-NEXT:    add s6, sp, s6
-; RV32V-NEXT:    addi s6, s6, 288
-; RV32V-NEXT:    vl8r.v v8, (s6) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr s6, vlenb
-; RV32V-NEXT:    slli s6, s6, 3
-; RV32V-NEXT:    mv s11, s6
-; RV32V-NEXT:    slli s6, s6, 1
-; RV32V-NEXT:    add s11, s11, s6
-; RV32V-NEXT:    slli s6, s6, 2
-; RV32V-NEXT:    add s11, s11, s6
-; RV32V-NEXT:    slli s6, s6, 1
-; RV32V-NEXT:    add s6, s6, s11
-; RV32V-NEXT:    add s6, sp, s6
-; RV32V-NEXT:    addi s6, s6, 288
-; RV32V-NEXT:    vs8r.v v8, (s6) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (s8), zero
-; RV32V-NEXT:    vlse64.v v8, (s5), zero
-; RV32V-NEXT:    csrr s5, vlenb
-; RV32V-NEXT:    slli s5, s5, 3
-; RV32V-NEXT:    mv s6, s5
-; RV32V-NEXT:    slli s5, s5, 3
-; RV32V-NEXT:    add s6, s6, s5
-; RV32V-NEXT:    slli s5, s5, 1
-; RV32V-NEXT:    add s5, s5, s6
-; RV32V-NEXT:    add s5, sp, s5
-; RV32V-NEXT:    addi s5, s5, 288
-; RV32V-NEXT:    vs8r.v v8, (s5) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (s4), zero
-; RV32V-NEXT:    csrr s4, vlenb
-; RV32V-NEXT:    slli s4, s4, 4
-; RV32V-NEXT:    mv s5, s4
-; RV32V-NEXT:    slli s4, s4, 2
-; RV32V-NEXT:    add s5, s5, s4
-; RV32V-NEXT:    slli s4, s4, 1
-; RV32V-NEXT:    add s4, s4, s5
-; RV32V-NEXT:    add s4, sp, s4
-; RV32V-NEXT:    addi s4, s4, 288
-; RV32V-NEXT:    vs8r.v v8, (s4) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (s2), zero
-; RV32V-NEXT:    csrr s2, vlenb
-; RV32V-NEXT:    slli s2, s2, 6
-; RV32V-NEXT:    mv s4, s2
-; RV32V-NEXT:    slli s2, s2, 1
-; RV32V-NEXT:    add s2, s2, s4
-; RV32V-NEXT:    add s2, sp, s2
-; RV32V-NEXT:    addi s2, s2, 288
-; RV32V-NEXT:    vs8r.v v8, (s2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v0, v24, v0
-; RV32V-NEXT:    csrr s2, vlenb
-; RV32V-NEXT:    slli s2, s2, 4
-; RV32V-NEXT:    mv s4, s2
-; RV32V-NEXT:    slli s2, s2, 1
-; RV32V-NEXT:    add s2, s2, s4
-; RV32V-NEXT:    add s2, sp, s2
-; RV32V-NEXT:    addi s2, s2, 288
-; RV32V-NEXT:    vs8r.v v0, (s2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr s2, vlenb
-; RV32V-NEXT:    slli s2, s2, 3
-; RV32V-NEXT:    mv s4, s2
-; RV32V-NEXT:    slli s2, s2, 3
-; RV32V-NEXT:    add s4, s4, s2
-; RV32V-NEXT:    slli s2, s2, 1
-; RV32V-NEXT:    add s2, s2, s4
-; RV32V-NEXT:    add s2, sp, s2
-; RV32V-NEXT:    addi s2, s2, 288
-; RV32V-NEXT:    vl8r.v v8, (s2) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr s2, vlenb
-; RV32V-NEXT:    slli s2, s2, 3
-; RV32V-NEXT:    mv s4, s2
-; RV32V-NEXT:    slli s2, s2, 2
-; RV32V-NEXT:    add s4, s4, s2
-; RV32V-NEXT:    slli s2, s2, 1
-; RV32V-NEXT:    add s2, s2, s4
-; RV32V-NEXT:    add s2, sp, s2
-; RV32V-NEXT:    addi s2, s2, 288
-; RV32V-NEXT:    vs8r.v v8, (s2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr s2, vlenb
-; RV32V-NEXT:    slli s2, s2, 4
-; RV32V-NEXT:    mv s4, s2
-; RV32V-NEXT:    slli s2, s2, 2
-; RV32V-NEXT:    add s4, s4, s2
-; RV32V-NEXT:    slli s2, s2, 1
-; RV32V-NEXT:    add s2, s2, s4
-; RV32V-NEXT:    add s2, sp, s2
-; RV32V-NEXT:    addi s2, s2, 288
-; RV32V-NEXT:    vl8r.v v8, (s2) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr s2, vlenb
-; RV32V-NEXT:    slli s2, s2, 5
-; RV32V-NEXT:    mv s4, s2
-; RV32V-NEXT:    slli s2, s2, 2
-; RV32V-NEXT:    add s2, s2, s4
-; RV32V-NEXT:    add s2, sp, s2
-; RV32V-NEXT:    addi s2, s2, 288
-; RV32V-NEXT:    vs8r.v v8, (s2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr s2, vlenb
-; RV32V-NEXT:    slli s2, s2, 6
-; RV32V-NEXT:    mv s4, s2
-; RV32V-NEXT:    slli s2, s2, 1
-; RV32V-NEXT:    add s2, s2, s4
-; RV32V-NEXT:    add s2, sp, s2
-; RV32V-NEXT:    addi s2, s2, 288
-; RV32V-NEXT:    vl8r.v v8, (s2) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr s2, vlenb
-; RV32V-NEXT:    slli s2, s2, 4
-; RV32V-NEXT:    mv s4, s2
-; RV32V-NEXT:    slli s2, s2, 2
-; RV32V-NEXT:    add s4, s4, s2
-; RV32V-NEXT:    slli s2, s2, 1
-; RV32V-NEXT:    add s2, s2, s4
-; RV32V-NEXT:    add s2, sp, s2
-; RV32V-NEXT:    addi s2, s2, 288
-; RV32V-NEXT:    vs8r.v v8, (s2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (s3), zero
-; RV32V-NEXT:    csrr s2, vlenb
-; RV32V-NEXT:    slli s2, s2, 6
-; RV32V-NEXT:    mv s3, s2
-; RV32V-NEXT:    slli s2, s2, 1
-; RV32V-NEXT:    add s2, s2, s3
-; RV32V-NEXT:    add s2, sp, s2
-; RV32V-NEXT:    addi s2, s2, 288
-; RV32V-NEXT:    vs8r.v v8, (s2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (s1), zero
-; RV32V-NEXT:    vlse64.v v8, (s0), zero
-; RV32V-NEXT:    csrr s0, vlenb
-; RV32V-NEXT:    slli s0, s0, 3
-; RV32V-NEXT:    mv s1, s0
-; RV32V-NEXT:    slli s0, s0, 3
-; RV32V-NEXT:    add s1, s1, s0
-; RV32V-NEXT:    slli s0, s0, 1
-; RV32V-NEXT:    add s0, s0, s1
-; RV32V-NEXT:    add s0, sp, s0
-; RV32V-NEXT:    addi s0, s0, 288
-; RV32V-NEXT:    vs8r.v v8, (s0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (t5), zero
-; RV32V-NEXT:    csrr t5, vlenb
-; RV32V-NEXT:    slli t5, t5, 3
-; RV32V-NEXT:    mv s0, t5
-; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add s0, s0, t5
-; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add s0, s0, t5
-; RV32V-NEXT:    slli t5, t5, 2
-; RV32V-NEXT:    add t5, t5, s0
-; RV32V-NEXT:    add t5, sp, t5
-; RV32V-NEXT:    addi t5, t5, 288
-; RV32V-NEXT:    vs8r.v v8, (t5) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr t5, vlenb
-; RV32V-NEXT:    slli t5, t5, 6
-; RV32V-NEXT:    mv s0, t5
-; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add t5, t5, s0
-; RV32V-NEXT:    add t5, sp, t5
-; RV32V-NEXT:    addi t5, t5, 288
-; RV32V-NEXT:    vl8r.v v8, (t5) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr t5, vlenb
-; RV32V-NEXT:    slli t5, t5, 3
-; RV32V-NEXT:    mv s0, t5
-; RV32V-NEXT:    slli t5, t5, 2
-; RV32V-NEXT:    add t5, t5, s0
-; RV32V-NEXT:    add t5, sp, t5
-; RV32V-NEXT:    addi t5, t5, 288
-; RV32V-NEXT:    vs8r.v v8, (t5) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v24, v0
-; RV32V-NEXT:    csrr t5, vlenb
-; RV32V-NEXT:    slli t5, t5, 5
-; RV32V-NEXT:    mv s0, t5
-; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add t5, t5, s0
-; RV32V-NEXT:    add t5, sp, t5
-; RV32V-NEXT:    addi t5, t5, 288
-; RV32V-NEXT:    vs8r.v v8, (t5) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr t5, vlenb
-; RV32V-NEXT:    slli t5, t5, 3
-; RV32V-NEXT:    mv s0, t5
-; RV32V-NEXT:    slli t5, t5, 3
-; RV32V-NEXT:    add s0, s0, t5
-; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add t5, t5, s0
-; RV32V-NEXT:    add t5, sp, t5
-; RV32V-NEXT:    addi t5, t5, 288
-; RV32V-NEXT:    vl8r.v v8, (t5) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr t5, vlenb
-; RV32V-NEXT:    slli t5, t5, 3
-; RV32V-NEXT:    mv s0, t5
-; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add s0, s0, t5
-; RV32V-NEXT:    slli t5, t5, 3
-; RV32V-NEXT:    add t5, t5, s0
-; RV32V-NEXT:    add t5, sp, t5
-; RV32V-NEXT:    addi t5, t5, 288
-; RV32V-NEXT:    vs8r.v v8, (t5) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr t5, vlenb
-; RV32V-NEXT:    slli t5, t5, 3
-; RV32V-NEXT:    mv s0, t5
-; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add s0, s0, t5
-; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add s0, s0, t5
-; RV32V-NEXT:    slli t5, t5, 2
-; RV32V-NEXT:    add t5, t5, s0
-; RV32V-NEXT:    add t5, sp, t5
-; RV32V-NEXT:    addi t5, t5, 288
-; RV32V-NEXT:    vl8r.v v8, (t5) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr t5, vlenb
-; RV32V-NEXT:    slli t5, t5, 3
-; RV32V-NEXT:    mv s0, t5
-; RV32V-NEXT:    slli t5, t5, 3
-; RV32V-NEXT:    add s0, s0, t5
-; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add t5, t5, s0
-; RV32V-NEXT:    add t5, sp, t5
-; RV32V-NEXT:    addi t5, t5, 288
-; RV32V-NEXT:    vs8r.v v8, (t5) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (t6), zero
-; RV32V-NEXT:    csrr t5, vlenb
-; RV32V-NEXT:    slli t5, t5, 3
-; RV32V-NEXT:    mv t6, t5
-; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add t6, t6, t5
-; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add t6, t6, t5
-; RV32V-NEXT:    slli t5, t5, 2
-; RV32V-NEXT:    add t5, t5, t6
-; RV32V-NEXT:    add t5, sp, t5
-; RV32V-NEXT:    addi t5, t5, 288
-; RV32V-NEXT:    vs8r.v v8, (t5) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (t4), zero
-; RV32V-NEXT:    vlse64.v v8, (t3), zero
-; RV32V-NEXT:    csrr t3, vlenb
-; RV32V-NEXT:    slli t3, t3, 6
-; RV32V-NEXT:    mv t4, t3
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    add t3, t3, t4
-; RV32V-NEXT:    add t3, sp, t3
-; RV32V-NEXT:    addi t3, t3, 288
-; RV32V-NEXT:    vs8r.v v8, (t3) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (t1), zero
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 4
-; RV32V-NEXT:    mv t3, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t3, t3, t1
-; RV32V-NEXT:    slli t1, t1, 2
-; RV32V-NEXT:    add t1, t1, t3
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vs8r.v v8, (t1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 3
-; RV32V-NEXT:    mv t3, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t3, t3, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t3, t3, t1
-; RV32V-NEXT:    slli t1, t1, 2
-; RV32V-NEXT:    add t1, t1, t3
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vl8r.v v8, (t1) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 5
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vs8r.v v8, (t1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v24, v0
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 3
-; RV32V-NEXT:    mv t3, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t3, t3, t1
-; RV32V-NEXT:    slli t1, t1, 2
-; RV32V-NEXT:    add t1, t1, t3
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vs8r.v v8, (t1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 6
-; RV32V-NEXT:    mv t3, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t1, t1, t3
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vl8r.v v8, (t1) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 4
-; RV32V-NEXT:    mv t3, t1
-; RV32V-NEXT:    slli t1, t1, 3
-; RV32V-NEXT:    add t1, t1, t3
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vs8r.v v8, (t1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 4
-; RV32V-NEXT:    mv t3, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t3, t3, t1
-; RV32V-NEXT:    slli t1, t1, 2
-; RV32V-NEXT:    add t1, t1, t3
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vl8r.v v8, (t1) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 6
-; RV32V-NEXT:    mv t3, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t1, t1, t3
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vs8r.v v8, (t1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (t2), zero
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 4
-; RV32V-NEXT:    mv t2, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t2, t2, t1
-; RV32V-NEXT:    slli t1, t1, 2
-; RV32V-NEXT:    add t1, t1, t2
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vs8r.v v8, (t1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (t0), zero
-; RV32V-NEXT:    vlse64.v v8, (a7), zero
-; RV32V-NEXT:    csrr a7, vlenb
-; RV32V-NEXT:    slli a7, a7, 3
-; RV32V-NEXT:    mv t0, a7
-; RV32V-NEXT:    slli a7, a7, 1
-; RV32V-NEXT:    add t0, t0, a7
-; RV32V-NEXT:    slli a7, a7, 1
-; RV32V-NEXT:    add t0, t0, a7
-; RV32V-NEXT:    slli a7, a7, 2
-; RV32V-NEXT:    add a7, a7, t0
-; RV32V-NEXT:    add a7, sp, a7
-; RV32V-NEXT:    addi a7, a7, 288
-; RV32V-NEXT:    vs8r.v v8, (a7) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (a3), zero
-; RV32V-NEXT:    csrr a3, vlenb
-; RV32V-NEXT:    slli a3, a3, 7
-; RV32V-NEXT:    add a3, sp, a3
-; RV32V-NEXT:    addi a3, a3, 288
-; RV32V-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a3, vlenb
-; RV32V-NEXT:    slli a3, a3, 4
-; RV32V-NEXT:    mv a7, a3
-; RV32V-NEXT:    slli a3, a3, 1
-; RV32V-NEXT:    add a7, a7, a3
-; RV32V-NEXT:    slli a3, a3, 2
-; RV32V-NEXT:    add a3, a3, a7
-; RV32V-NEXT:    add a3, sp, a3
-; RV32V-NEXT:    addi a3, a3, 288
-; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr a3, vlenb
-; RV32V-NEXT:    slli a3, a3, 3
-; RV32V-NEXT:    mv a7, a3
-; RV32V-NEXT:    slli a3, a3, 1
-; RV32V-NEXT:    add a3, a3, a7
-; RV32V-NEXT:    add a3, sp, a3
-; RV32V-NEXT:    addi a3, a3, 288
-; RV32V-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v24, v0
-; RV32V-NEXT:    csrr a3, vlenb
-; RV32V-NEXT:    slli a3, a3, 4
-; RV32V-NEXT:    mv a7, a3
-; RV32V-NEXT:    slli a3, a3, 2
-; RV32V-NEXT:    add a3, a3, a7
-; RV32V-NEXT:    add a3, sp, a3
-; RV32V-NEXT:    addi a3, a3, 288
-; RV32V-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a3, vlenb
-; RV32V-NEXT:    slli a3, a3, 3
-; RV32V-NEXT:    mv a7, a3
-; RV32V-NEXT:    slli a3, a3, 1
-; RV32V-NEXT:    add a7, a7, a3
-; RV32V-NEXT:    slli a3, a3, 1
-; RV32V-NEXT:    add a7, a7, a3
-; RV32V-NEXT:    slli a3, a3, 2
-; RV32V-NEXT:    add a3, a3, a7
-; RV32V-NEXT:    add a3, sp, a3
-; RV32V-NEXT:    addi a3, a3, 288
-; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr a3, vlenb
-; RV32V-NEXT:    slli a3, a3, 3
-; RV32V-NEXT:    mv a7, a3
-; RV32V-NEXT:    slli a3, a3, 4
-; RV32V-NEXT:    add a3, a3, a7
-; RV32V-NEXT:    add a3, sp, a3
-; RV32V-NEXT:    addi a3, a3, 288
-; RV32V-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a3, vlenb
-; RV32V-NEXT:    slli a3, a3, 7
-; RV32V-NEXT:    add a3, sp, a3
-; RV32V-NEXT:    addi a3, a3, 288
-; RV32V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
+; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    csrr a3, vlenb
 ; RV32V-NEXT:    slli a3, a3, 3
-; RV32V-NEXT:    mv a7, a3
-; RV32V-NEXT:    slli a3, a3, 1
-; RV32V-NEXT:    add a7, a7, a3
-; RV32V-NEXT:    slli a3, a3, 1
-; RV32V-NEXT:    add a7, a7, a3
-; RV32V-NEXT:    slli a3, a3, 2
-; RV32V-NEXT:    add a3, a3, a7
-; RV32V-NEXT:    add a3, sp, a3
-; RV32V-NEXT:    addi a3, a3, 288
-; RV32V-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (a6), zero
-; RV32V-NEXT:    csrr a3, vlenb
-; RV32V-NEXT:    slli a3, a3, 7
 ; RV32V-NEXT:    add a3, sp, a3
-; RV32V-NEXT:    addi a3, a3, 288
-; RV32V-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (a5), zero
-; RV32V-NEXT:    vlse64.v v8, (a2), zero
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a2, a2, 4
-; RV32V-NEXT:    mv a3, a2
-; RV32V-NEXT:    slli a2, a2, 1
-; RV32V-NEXT:    add a3, a3, a2
-; RV32V-NEXT:    slli a2, a2, 2
-; RV32V-NEXT:    add a2, a2, a3
-; RV32V-NEXT:    add a2, sp, a2
-; RV32V-NEXT:    addi a2, a2, 288
-; RV32V-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (a1), zero
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a2, a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a2, a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 7
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 4
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v24, v0
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 4
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a2, a2, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 7
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a2, a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a2, a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 4
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a2, a2, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (s9), zero
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 6
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (a4), zero
-; RV32V-NEXT:    vlse64.v v8, (s7), zero
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a2, a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a2, a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v8, (s10), zero
-; RV32V-NEXT:    addi a1, sp, 288
-; RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 6
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v24, v0
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 6
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a2, a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a2, a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a2, a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a2, a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a1, sp, 288
-; RV32V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v0, v24, v8
-; RV32V-NEXT:    vand.vx v8, v24, a0
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 8
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    addi a3, a3, 16
+; RV32V-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v0, v8, a2
+; RV32V-NEXT:    addi a2, sp, 8
+; RV32V-NEXT:    sw a0, 8(sp)
+; RV32V-NEXT:    sw zero, 12(sp)
+; RV32V-NEXT:    vlse64.v v24, (a2), zero
+; RV32V-NEXT:    addi a0, sp, 16
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vmul.vv v0, v16, v0
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    addi a0, a0, 16
 ; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vxor.vv v0, v24, v0
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    addi a2, sp, 16
+; RV32V-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vxor.vv v8, v0, v8
+; RV32V-NEXT:    vmul.vv v16, v16, v24
+; RV32V-NEXT:    vxor.vv v16, v8, v16
+; RV32V-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32V-NEXT:    vnsrl.wx v8, v16, a1
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vmul.vv v16, v16, v0
-; RV32V-NEXT:    vxor.vv v16, v8, v16
-; RV32V-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vnsrl.wx v8, v16, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add sp, sp, a0
-; RV32V-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    addi sp, sp, 352
+; RV32V-NEXT:    addi sp, sp, 16
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmulh_nxv8i32_vv:
+; RV64V-LABEL: clmulh_nxv8i32_vx:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64V-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV64V-NEXT:    vmv.v.x v24, a0
+; RV64V-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV64V-NEXT:    vzext.vf2 v16, v8
-; RV64V-NEXT:    vzext.vf2 v24, v12
 ; RV64V-NEXT:    li a1, 16
 ; RV64V-NEXT:    li a0, 32
 ; RV64V-NEXT:    li a2, 64
-; RV64V-NEXT:    vand.vi v8, v24, 2
-; RV64V-NEXT:    vand.vi v0, v24, 1
-; RV64V-NEXT:    vmul.vv v8, v16, v8
+; RV64V-NEXT:    vzext.vf2 v8, v24
+; RV64V-NEXT:    vand.vi v24, v8, 2
+; RV64V-NEXT:    vand.vi v0, v8, 1
+; RV64V-NEXT:    vmul.vv v24, v16, v24
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v0, v8
-; RV64V-NEXT:    vand.vi v0, v24, 4
+; RV64V-NEXT:    vxor.vv v24, v0, v24
+; RV64V-NEXT:    vand.vi v0, v8, 4
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vi v0, v24, 8
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vi v0, v8, 8
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a1
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a1
 ; RV64V-NEXT:    li a1, 128
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a0
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    li a2, 256
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a1
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a1
 ; RV64V-NEXT:    li a1, 512
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    li a2, 1024
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a1
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a1
 ; RV64V-NEXT:    li a1, 1
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    slli a2, a1, 11
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 1
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 2
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 4
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 8
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 16
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 32
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 64
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 128
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 256
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 512
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 1024
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 2048
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 4096
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 8192
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 16384
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 32768
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 65536
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 131072
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
 ; RV64V-NEXT:    lui a2, 262144
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vand.vx v0, v24, a2
 ; RV64V-NEXT:    slli a1, a1, 31
 ; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vand.vx v24, v24, a1
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    vmul.vv v16, v16, v24
-; RV64V-NEXT:    vxor.vv v16, v8, v16
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vand.vx v8, v8, a1
+; RV64V-NEXT:    vmul.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vv v8, v16, v8
+; RV64V-NEXT:    vxor.vv v16, v24, v8
 ; RV64V-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV64V-NEXT:    vnsrl.wx v8, v16, a0
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv8i32_vv:
+; RV32ZVBC-LABEL: clmulh_nxv8i32_vx:
 ; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v16, v12
-; RV32ZVBC-NEXT:    vzext.vf2 v24, v8
-; RV32ZVBC-NEXT:    vclmul.vv v16, v24, v16
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf2 v16, v8
+; RV32ZVBC-NEXT:    vzext.vf2 v8, v24
+; RV32ZVBC-NEXT:    vclmul.vv v16, v16, v8
 ; RV32ZVBC-NEXT:    li a0, 32
 ; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV32ZVBC-NEXT:    vnsrl.wx v8, v16, a0
 ; RV32ZVBC-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv8i32_vv:
+; RV64ZVBC-LABEL: clmulh_nxv8i32_vx:
 ; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v16, v12
-; RV64ZVBC-NEXT:    vzext.vf2 v24, v8
-; RV64ZVBC-NEXT:    vclmul.vv v16, v24, v16
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf2 v16, v8
+; RV64ZVBC-NEXT:    vzext.vf2 v8, v24
+; RV64ZVBC-NEXT:    vclmul.vv v16, v16, v8
 ; RV64ZVBC-NEXT:    li a0, 32
 ; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV64ZVBC-NEXT:    vnsrl.wx v8, v16, a0
 ; RV64ZVBC-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i64 0
+  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
   %va.ext = zext <vscale x 8 x i32> %va to <vscale x 8 x i64>
   %vb.ext = zext <vscale x 8 x i32> %vb to <vscale x 8 x i64>
   %clmul = call <vscale x 8 x i64> @llvm.clmul.nxv8i64(<vscale x 8 x i64> %va.ext, <vscale x 8 x i64> %vb.ext)
@@ -10164,32544 +5558,4877 @@ define <vscale x 8 x i32> @clmulh_nxv8i32_vv(<vscale x 8 x i32> %va, <vscale x 8
   ret <vscale x 8 x i32> %res
 }
 
-define <vscale x 8 x i32> @clmulh_nxv8i32_vx(<vscale x 8 x i32> %va, i32 %b) nounwind {
-; RV32V-LABEL: clmulh_nxv8i32_vx:
-; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -352
-; RV32V-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 4
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 4
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    sub sp, sp, a1
-; RV32V-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32V-NEXT:    vmv.v.x v24, a0
-; RV32V-NEXT:    lui s11, 524288
-; RV32V-NEXT:    li a0, 1
-; RV32V-NEXT:    li a3, 2
-; RV32V-NEXT:    li a4, 4
-; RV32V-NEXT:    li a1, 8
-; RV32V-NEXT:    li s10, 16
-; RV32V-NEXT:    li ra, 32
-; RV32V-NEXT:    li s9, 64
-; RV32V-NEXT:    li s8, 128
-; RV32V-NEXT:    li s7, 256
-; RV32V-NEXT:    li s6, 512
-; RV32V-NEXT:    li s5, 1024
-; RV32V-NEXT:    lui s4, 1
-; RV32V-NEXT:    lui s3, 2
-; RV32V-NEXT:    lui s2, 4
-; RV32V-NEXT:    lui s1, 8
-; RV32V-NEXT:    lui s0, 16
-; RV32V-NEXT:    lui t6, 32
-; RV32V-NEXT:    lui t5, 64
-; RV32V-NEXT:    lui t4, 128
-; RV32V-NEXT:    lui t3, 256
-; RV32V-NEXT:    lui t2, 512
-; RV32V-NEXT:    lui t1, 1024
-; RV32V-NEXT:    lui t0, 2048
-; RV32V-NEXT:    lui a7, 4096
-; RV32V-NEXT:    lui a6, 8192
-; RV32V-NEXT:    lui a5, 16384
-; RV32V-NEXT:    lui a2, 32768
-; RV32V-NEXT:    sw s11, 272(sp)
-; RV32V-NEXT:    sw zero, 276(sp)
-; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw a0, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw a3, 260(sp)
-; RV32V-NEXT:    lui a3, 65536
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw a4, 252(sp)
-; RV32V-NEXT:    lui a4, 131072
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw a1, 244(sp)
-; RV32V-NEXT:    lui a1, 262144
-; RV32V-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32V-NEXT:    vzext.vf2 v16, v8
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw s10, 236(sp)
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw ra, 228(sp)
-; RV32V-NEXT:    li ra, 32
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s9, 220(sp)
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s8, 212(sp)
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s7, 204(sp)
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s6, 196(sp)
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s5, 188(sp)
-; RV32V-NEXT:    slli a0, a0, 11
-; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw a0, 180(sp)
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw s4, 172(sp)
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s3, 164(sp)
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw s2, 156(sp)
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw s1, 148(sp)
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw s0, 140(sp)
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw t6, 132(sp)
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t5, 124(sp)
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t4, 116(sp)
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t3, 108(sp)
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw t2, 100(sp)
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw t1, 92(sp)
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw t0, 84(sp)
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a7, 76(sp)
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a6, 68(sp)
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a5, 60(sp)
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a2, 52(sp)
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw a3, 44(sp)
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw a4, 36(sp)
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    sw a1, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw s11, 20(sp)
-; RV32V-NEXT:    addi s11, sp, 272
-; RV32V-NEXT:    vzext.vf2 v8, v24
-; RV32V-NEXT:    vand.vi v24, v8, 2
-; RV32V-NEXT:    vand.vi v0, v8, 1
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vi v0, v0, 0
-; RV32V-NEXT:    vxor.vv v24, v0, v24
-; RV32V-NEXT:    vand.vi v0, v8, 4
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vi v0, v8, 8
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, s10
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, ra
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, s9
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, s8
-; RV32V-NEXT:    addi s8, sp, 248
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, s7
-; RV32V-NEXT:    addi ra, sp, 240
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, s6
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, s5
-; RV32V-NEXT:    addi s9, sp, 224
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a0
-; RV32V-NEXT:    addi s7, sp, 216
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, s4
-; RV32V-NEXT:    addi s5, sp, 208
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, s3
-; RV32V-NEXT:    addi s6, sp, 200
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, s2
-; RV32V-NEXT:    addi s3, sp, 192
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, s1
-; RV32V-NEXT:    addi s2, sp, 184
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, s0
-; RV32V-NEXT:    addi s0, sp, 176
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, t6
-; RV32V-NEXT:    addi s1, sp, 168
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, t5
-; RV32V-NEXT:    addi t6, sp, 160
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, t4
-; RV32V-NEXT:    addi t5, sp, 152
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, t3
-; RV32V-NEXT:    addi t3, sp, 144
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, t2
-; RV32V-NEXT:    addi t4, sp, 136
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, t1
-; RV32V-NEXT:    addi t2, sp, 128
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, t0
-; RV32V-NEXT:    addi t1, sp, 120
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a7
-; RV32V-NEXT:    addi a7, sp, 112
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a6
-; RV32V-NEXT:    addi t0, sp, 104
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a5
-; RV32V-NEXT:    addi a6, sp, 96
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a2
-; RV32V-NEXT:    addi a5, sp, 88
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a3
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    vand.vx v0, v8, a4
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    vxor.vv v24, v24, v0
-; RV32V-NEXT:    sw t2, 4(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (s11), zero
-; RV32V-NEXT:    addi a2, sp, 80
-; RV32V-NEXT:    addi a3, sp, 72
-; RV32V-NEXT:    addi a4, sp, 64
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 8
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a1, sp, 56
-; RV32V-NEXT:    addi a0, sp, 48
-; RV32V-NEXT:    addi s4, sp, 40
-; RV32V-NEXT:    addi s11, sp, 32
-; RV32V-NEXT:    addi s10, sp, 264
-; RV32V-NEXT:    vlse64.v v24, (s10), zero
-; RV32V-NEXT:    csrr t2, vlenb
-; RV32V-NEXT:    slli t2, t2, 4
-; RV32V-NEXT:    mv s10, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s10, s10, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s10, s10, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add t2, t2, s10
-; RV32V-NEXT:    add t2, sp, t2
-; RV32V-NEXT:    addi t2, t2, 288
-; RV32V-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi s10, sp, 256
-; RV32V-NEXT:    vlse64.v v0, (s10), zero
-; RV32V-NEXT:    vlse64.v v24, (s8), zero
-; RV32V-NEXT:    csrr t2, vlenb
-; RV32V-NEXT:    slli t2, t2, 3
-; RV32V-NEXT:    mv s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add t2, t2, s8
-; RV32V-NEXT:    add t2, sp, t2
-; RV32V-NEXT:    addi t2, t2, 288
-; RV32V-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (ra), zero
-; RV32V-NEXT:    csrr t2, vlenb
-; RV32V-NEXT:    slli t2, t2, 3
-; RV32V-NEXT:    mv s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 2
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add t2, t2, s8
-; RV32V-NEXT:    add t2, sp, t2
-; RV32V-NEXT:    addi t2, t2, 288
-; RV32V-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr t2, vlenb
-; RV32V-NEXT:    slli t2, t2, 4
-; RV32V-NEXT:    mv s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add t2, t2, s8
-; RV32V-NEXT:    add t2, sp, t2
-; RV32V-NEXT:    addi t2, t2, 288
-; RV32V-NEXT:    vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr t2, vlenb
-; RV32V-NEXT:    slli t2, t2, 5
-; RV32V-NEXT:    mv s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add t2, t2, s8
-; RV32V-NEXT:    add t2, sp, t2
-; RV32V-NEXT:    addi t2, t2, 288
-; RV32V-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr t2, vlenb
-; RV32V-NEXT:    slli t2, t2, 3
-; RV32V-NEXT:    mv s8, t2
-; RV32V-NEXT:    slli t2, t2, 2
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add t2, t2, s8
-; RV32V-NEXT:    add t2, sp, t2
-; RV32V-NEXT:    addi t2, t2, 288
-; RV32V-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr t2, vlenb
-; RV32V-NEXT:    slli t2, t2, 3
-; RV32V-NEXT:    mv s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add t2, t2, s8
-; RV32V-NEXT:    add t2, sp, t2
-; RV32V-NEXT:    addi t2, t2, 288
-; RV32V-NEXT:    vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr t2, vlenb
-; RV32V-NEXT:    slli t2, t2, 4
-; RV32V-NEXT:    mv s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add t2, t2, s8
-; RV32V-NEXT:    add t2, sp, t2
-; RV32V-NEXT:    addi t2, t2, 288
-; RV32V-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr t2, vlenb
-; RV32V-NEXT:    slli t2, t2, 3
-; RV32V-NEXT:    mv s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 2
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add t2, t2, s8
-; RV32V-NEXT:    add t2, sp, t2
-; RV32V-NEXT:    addi t2, t2, 288
-; RV32V-NEXT:    vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr t2, vlenb
-; RV32V-NEXT:    slli t2, t2, 3
-; RV32V-NEXT:    mv s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add s8, s8, t2
-; RV32V-NEXT:    slli t2, t2, 1
-; RV32V-NEXT:    add t2, t2, s8
-; RV32V-NEXT:    add t2, sp, t2
-; RV32V-NEXT:    addi t2, t2, 288
-; RV32V-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi s8, sp, 24
-; RV32V-NEXT:    addi s10, sp, 16
-; RV32V-NEXT:    addi ra, sp, 232
-; RV32V-NEXT:    vlse64.v v24, (ra), zero
-; RV32V-NEXT:    csrr ra, vlenb
-; RV32V-NEXT:    slli ra, ra, 4
-; RV32V-NEXT:    mv t2, ra
-; RV32V-NEXT:    slli ra, ra, 2
-; RV32V-NEXT:    add t2, t2, ra
-; RV32V-NEXT:    slli ra, ra, 1
-; RV32V-NEXT:    add ra, ra, t2
-; RV32V-NEXT:    lw t2, 4(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    add ra, sp, ra
-; RV32V-NEXT:    addi ra, ra, 288
-; RV32V-NEXT:    vs8r.v v24, (ra) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (s9), zero
-; RV32V-NEXT:    vlse64.v v24, (s7), zero
-; RV32V-NEXT:    csrr s7, vlenb
-; RV32V-NEXT:    slli s7, s7, 3
-; RV32V-NEXT:    mv s9, s7
-; RV32V-NEXT:    slli s7, s7, 1
-; RV32V-NEXT:    add s9, s9, s7
-; RV32V-NEXT:    slli s7, s7, 2
-; RV32V-NEXT:    add s9, s9, s7
-; RV32V-NEXT:    slli s7, s7, 1
-; RV32V-NEXT:    add s7, s7, s9
-; RV32V-NEXT:    add s7, sp, s7
-; RV32V-NEXT:    addi s7, s7, 288
-; RV32V-NEXT:    vs8r.v v24, (s7) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (s5), zero
-; RV32V-NEXT:    csrr s5, vlenb
-; RV32V-NEXT:    slli s5, s5, 3
-; RV32V-NEXT:    mv s7, s5
-; RV32V-NEXT:    slli s5, s5, 3
-; RV32V-NEXT:    add s7, s7, s5
-; RV32V-NEXT:    slli s5, s5, 1
-; RV32V-NEXT:    add s5, s5, s7
-; RV32V-NEXT:    add s5, sp, s5
-; RV32V-NEXT:    addi s5, s5, 288
-; RV32V-NEXT:    vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr s5, vlenb
-; RV32V-NEXT:    slli s5, s5, 4
-; RV32V-NEXT:    mv s7, s5
-; RV32V-NEXT:    slli s5, s5, 2
-; RV32V-NEXT:    add s7, s7, s5
-; RV32V-NEXT:    slli s5, s5, 1
-; RV32V-NEXT:    add s5, s5, s7
-; RV32V-NEXT:    add s5, sp, s5
-; RV32V-NEXT:    addi s5, s5, 288
-; RV32V-NEXT:    vl8r.v v24, (s5) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr s5, vlenb
-; RV32V-NEXT:    slli s5, s5, 3
-; RV32V-NEXT:    mv s7, s5
-; RV32V-NEXT:    slli s5, s5, 1
-; RV32V-NEXT:    add s7, s7, s5
-; RV32V-NEXT:    slli s5, s5, 1
-; RV32V-NEXT:    add s5, s5, s7
-; RV32V-NEXT:    add s5, sp, s5
-; RV32V-NEXT:    addi s5, s5, 288
-; RV32V-NEXT:    vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr s5, vlenb
-; RV32V-NEXT:    slli s5, s5, 4
-; RV32V-NEXT:    mv s7, s5
-; RV32V-NEXT:    slli s5, s5, 1
-; RV32V-NEXT:    add s7, s7, s5
-; RV32V-NEXT:    slli s5, s5, 1
-; RV32V-NEXT:    add s5, s5, s7
-; RV32V-NEXT:    add s5, sp, s5
-; RV32V-NEXT:    addi s5, s5, 288
-; RV32V-NEXT:    vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr s5, vlenb
-; RV32V-NEXT:    slli s5, s5, 3
-; RV32V-NEXT:    mv s7, s5
-; RV32V-NEXT:    slli s5, s5, 1
-; RV32V-NEXT:    add s7, s7, s5
-; RV32V-NEXT:    slli s5, s5, 2
-; RV32V-NEXT:    add s7, s7, s5
-; RV32V-NEXT:    slli s5, s5, 1
-; RV32V-NEXT:    add s5, s5, s7
-; RV32V-NEXT:    add s5, sp, s5
-; RV32V-NEXT:    addi s5, s5, 288
-; RV32V-NEXT:    vl8r.v v24, (s5) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr s5, vlenb
-; RV32V-NEXT:    slli s5, s5, 3
-; RV32V-NEXT:    mv s7, s5
-; RV32V-NEXT:    slli s5, s5, 2
-; RV32V-NEXT:    add s7, s7, s5
-; RV32V-NEXT:    slli s5, s5, 2
-; RV32V-NEXT:    add s5, s5, s7
-; RV32V-NEXT:    add s5, sp, s5
-; RV32V-NEXT:    addi s5, s5, 288
-; RV32V-NEXT:    vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr s5, vlenb
-; RV32V-NEXT:    slli s5, s5, 3
-; RV32V-NEXT:    mv s7, s5
-; RV32V-NEXT:    slli s5, s5, 3
-; RV32V-NEXT:    add s7, s7, s5
-; RV32V-NEXT:    slli s5, s5, 1
-; RV32V-NEXT:    add s5, s5, s7
-; RV32V-NEXT:    add s5, sp, s5
-; RV32V-NEXT:    addi s5, s5, 288
-; RV32V-NEXT:    vl8r.v v24, (s5) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr s5, vlenb
-; RV32V-NEXT:    slli s5, s5, 3
-; RV32V-NEXT:    mv s7, s5
-; RV32V-NEXT:    slli s5, s5, 1
-; RV32V-NEXT:    add s7, s7, s5
-; RV32V-NEXT:    slli s5, s5, 2
-; RV32V-NEXT:    add s7, s7, s5
-; RV32V-NEXT:    slli s5, s5, 1
-; RV32V-NEXT:    add s5, s5, s7
-; RV32V-NEXT:    add s5, sp, s5
-; RV32V-NEXT:    addi s5, s5, 288
-; RV32V-NEXT:    vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (s6), zero
-; RV32V-NEXT:    vlse64.v v24, (s3), zero
-; RV32V-NEXT:    csrr s3, vlenb
-; RV32V-NEXT:    slli s3, s3, 3
-; RV32V-NEXT:    mv s5, s3
-; RV32V-NEXT:    slli s3, s3, 3
-; RV32V-NEXT:    add s5, s5, s3
-; RV32V-NEXT:    slli s3, s3, 1
-; RV32V-NEXT:    add s3, s3, s5
-; RV32V-NEXT:    add s3, sp, s3
-; RV32V-NEXT:    addi s3, s3, 288
-; RV32V-NEXT:    vs8r.v v24, (s3) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (s2), zero
-; RV32V-NEXT:    csrr s2, vlenb
-; RV32V-NEXT:    slli s2, s2, 4
-; RV32V-NEXT:    mv s3, s2
-; RV32V-NEXT:    slli s2, s2, 2
-; RV32V-NEXT:    add s3, s3, s2
-; RV32V-NEXT:    slli s2, s2, 1
-; RV32V-NEXT:    add s2, s2, s3
-; RV32V-NEXT:    add s2, sp, s2
-; RV32V-NEXT:    addi s2, s2, 288
-; RV32V-NEXT:    vs8r.v v24, (s2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (s0), zero
-; RV32V-NEXT:    csrr s0, vlenb
-; RV32V-NEXT:    slli s0, s0, 6
-; RV32V-NEXT:    mv s2, s0
-; RV32V-NEXT:    slli s0, s0, 1
-; RV32V-NEXT:    add s0, s0, s2
-; RV32V-NEXT:    add s0, sp, s0
-; RV32V-NEXT:    addi s0, s0, 288
-; RV32V-NEXT:    vs8r.v v24, (s0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v0, v8, v0
-; RV32V-NEXT:    csrr s0, vlenb
-; RV32V-NEXT:    slli s0, s0, 4
-; RV32V-NEXT:    mv s2, s0
-; RV32V-NEXT:    slli s0, s0, 1
-; RV32V-NEXT:    add s0, s0, s2
-; RV32V-NEXT:    add s0, sp, s0
-; RV32V-NEXT:    addi s0, s0, 288
-; RV32V-NEXT:    vs8r.v v0, (s0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr s0, vlenb
-; RV32V-NEXT:    slli s0, s0, 3
-; RV32V-NEXT:    mv s2, s0
-; RV32V-NEXT:    slli s0, s0, 3
-; RV32V-NEXT:    add s2, s2, s0
-; RV32V-NEXT:    slli s0, s0, 1
-; RV32V-NEXT:    add s0, s0, s2
-; RV32V-NEXT:    add s0, sp, s0
-; RV32V-NEXT:    addi s0, s0, 288
-; RV32V-NEXT:    vl8r.v v24, (s0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr s0, vlenb
-; RV32V-NEXT:    slli s0, s0, 3
-; RV32V-NEXT:    mv s2, s0
-; RV32V-NEXT:    slli s0, s0, 2
-; RV32V-NEXT:    add s2, s2, s0
-; RV32V-NEXT:    slli s0, s0, 1
-; RV32V-NEXT:    add s0, s0, s2
-; RV32V-NEXT:    add s0, sp, s0
-; RV32V-NEXT:    addi s0, s0, 288
-; RV32V-NEXT:    vs8r.v v24, (s0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr s0, vlenb
-; RV32V-NEXT:    slli s0, s0, 4
-; RV32V-NEXT:    mv s2, s0
-; RV32V-NEXT:    slli s0, s0, 2
-; RV32V-NEXT:    add s2, s2, s0
-; RV32V-NEXT:    slli s0, s0, 1
-; RV32V-NEXT:    add s0, s0, s2
-; RV32V-NEXT:    add s0, sp, s0
-; RV32V-NEXT:    addi s0, s0, 288
-; RV32V-NEXT:    vl8r.v v24, (s0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr s0, vlenb
-; RV32V-NEXT:    slli s0, s0, 5
-; RV32V-NEXT:    mv s2, s0
-; RV32V-NEXT:    slli s0, s0, 2
-; RV32V-NEXT:    add s0, s0, s2
-; RV32V-NEXT:    add s0, sp, s0
-; RV32V-NEXT:    addi s0, s0, 288
-; RV32V-NEXT:    vs8r.v v24, (s0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr s0, vlenb
-; RV32V-NEXT:    slli s0, s0, 6
-; RV32V-NEXT:    mv s2, s0
-; RV32V-NEXT:    slli s0, s0, 1
-; RV32V-NEXT:    add s0, s0, s2
-; RV32V-NEXT:    add s0, sp, s0
-; RV32V-NEXT:    addi s0, s0, 288
-; RV32V-NEXT:    vl8r.v v24, (s0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr s0, vlenb
-; RV32V-NEXT:    slli s0, s0, 4
-; RV32V-NEXT:    mv s2, s0
-; RV32V-NEXT:    slli s0, s0, 2
-; RV32V-NEXT:    add s2, s2, s0
-; RV32V-NEXT:    slli s0, s0, 1
-; RV32V-NEXT:    add s0, s0, s2
-; RV32V-NEXT:    add s0, sp, s0
-; RV32V-NEXT:    addi s0, s0, 288
-; RV32V-NEXT:    vs8r.v v24, (s0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (s1), zero
-; RV32V-NEXT:    csrr s0, vlenb
-; RV32V-NEXT:    slli s0, s0, 6
-; RV32V-NEXT:    mv s1, s0
-; RV32V-NEXT:    slli s0, s0, 1
-; RV32V-NEXT:    add s0, s0, s1
-; RV32V-NEXT:    add s0, sp, s0
-; RV32V-NEXT:    addi s0, s0, 288
-; RV32V-NEXT:    vs8r.v v24, (s0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (t6), zero
-; RV32V-NEXT:    vlse64.v v24, (t5), zero
-; RV32V-NEXT:    csrr t5, vlenb
-; RV32V-NEXT:    slli t5, t5, 3
-; RV32V-NEXT:    mv t6, t5
-; RV32V-NEXT:    slli t5, t5, 3
-; RV32V-NEXT:    add t6, t6, t5
-; RV32V-NEXT:    slli t5, t5, 1
-; RV32V-NEXT:    add t5, t5, t6
-; RV32V-NEXT:    add t5, sp, t5
-; RV32V-NEXT:    addi t5, t5, 288
-; RV32V-NEXT:    vs8r.v v24, (t5) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (t3), zero
-; RV32V-NEXT:    csrr t3, vlenb
-; RV32V-NEXT:    slli t3, t3, 3
-; RV32V-NEXT:    mv t5, t3
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    add t5, t5, t3
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    add t5, t5, t3
-; RV32V-NEXT:    slli t3, t3, 2
-; RV32V-NEXT:    add t3, t3, t5
-; RV32V-NEXT:    add t3, sp, t3
-; RV32V-NEXT:    addi t3, t3, 288
-; RV32V-NEXT:    vs8r.v v24, (t3) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr t3, vlenb
-; RV32V-NEXT:    slli t3, t3, 6
-; RV32V-NEXT:    mv t5, t3
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    add t3, t3, t5
-; RV32V-NEXT:    add t3, sp, t3
-; RV32V-NEXT:    addi t3, t3, 288
-; RV32V-NEXT:    vl8r.v v24, (t3) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr t3, vlenb
-; RV32V-NEXT:    slli t3, t3, 3
-; RV32V-NEXT:    mv t5, t3
-; RV32V-NEXT:    slli t3, t3, 2
-; RV32V-NEXT:    add t3, t3, t5
-; RV32V-NEXT:    add t3, sp, t3
-; RV32V-NEXT:    addi t3, t3, 288
-; RV32V-NEXT:    vs8r.v v24, (t3) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr t3, vlenb
-; RV32V-NEXT:    slli t3, t3, 5
-; RV32V-NEXT:    mv t5, t3
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    add t3, t3, t5
-; RV32V-NEXT:    add t3, sp, t3
-; RV32V-NEXT:    addi t3, t3, 288
-; RV32V-NEXT:    vs8r.v v24, (t3) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr t3, vlenb
-; RV32V-NEXT:    slli t3, t3, 3
-; RV32V-NEXT:    mv t5, t3
-; RV32V-NEXT:    slli t3, t3, 3
-; RV32V-NEXT:    add t5, t5, t3
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    add t3, t3, t5
-; RV32V-NEXT:    add t3, sp, t3
-; RV32V-NEXT:    addi t3, t3, 288
-; RV32V-NEXT:    vl8r.v v24, (t3) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr t3, vlenb
-; RV32V-NEXT:    slli t3, t3, 3
-; RV32V-NEXT:    mv t5, t3
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    add t5, t5, t3
-; RV32V-NEXT:    slli t3, t3, 3
-; RV32V-NEXT:    add t3, t3, t5
-; RV32V-NEXT:    add t3, sp, t3
-; RV32V-NEXT:    addi t3, t3, 288
-; RV32V-NEXT:    vs8r.v v24, (t3) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr t3, vlenb
-; RV32V-NEXT:    slli t3, t3, 3
-; RV32V-NEXT:    mv t5, t3
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    add t5, t5, t3
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    add t5, t5, t3
-; RV32V-NEXT:    slli t3, t3, 2
-; RV32V-NEXT:    add t3, t3, t5
-; RV32V-NEXT:    add t3, sp, t3
-; RV32V-NEXT:    addi t3, t3, 288
-; RV32V-NEXT:    vl8r.v v24, (t3) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr t3, vlenb
-; RV32V-NEXT:    slli t3, t3, 3
-; RV32V-NEXT:    mv t5, t3
-; RV32V-NEXT:    slli t3, t3, 3
-; RV32V-NEXT:    add t5, t5, t3
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    add t3, t3, t5
-; RV32V-NEXT:    add t3, sp, t3
-; RV32V-NEXT:    addi t3, t3, 288
-; RV32V-NEXT:    vs8r.v v24, (t3) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (t4), zero
-; RV32V-NEXT:    csrr t3, vlenb
-; RV32V-NEXT:    slli t3, t3, 3
-; RV32V-NEXT:    mv t4, t3
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    add t4, t4, t3
-; RV32V-NEXT:    slli t3, t3, 1
-; RV32V-NEXT:    add t4, t4, t3
-; RV32V-NEXT:    slli t3, t3, 2
-; RV32V-NEXT:    add t3, t3, t4
-; RV32V-NEXT:    add t3, sp, t3
-; RV32V-NEXT:    addi t3, t3, 288
-; RV32V-NEXT:    vs8r.v v24, (t3) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (t2), zero
-; RV32V-NEXT:    vlse64.v v24, (t1), zero
-; RV32V-NEXT:    csrr t1, vlenb
-; RV32V-NEXT:    slli t1, t1, 6
-; RV32V-NEXT:    mv t2, t1
-; RV32V-NEXT:    slli t1, t1, 1
-; RV32V-NEXT:    add t1, t1, t2
-; RV32V-NEXT:    add t1, sp, t1
-; RV32V-NEXT:    addi t1, t1, 288
-; RV32V-NEXT:    vs8r.v v24, (t1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (a7), zero
-; RV32V-NEXT:    csrr a7, vlenb
-; RV32V-NEXT:    slli a7, a7, 4
-; RV32V-NEXT:    mv t1, a7
-; RV32V-NEXT:    slli a7, a7, 1
-; RV32V-NEXT:    add t1, t1, a7
-; RV32V-NEXT:    slli a7, a7, 2
-; RV32V-NEXT:    add a7, a7, t1
-; RV32V-NEXT:    add a7, sp, a7
-; RV32V-NEXT:    addi a7, a7, 288
-; RV32V-NEXT:    vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a7, vlenb
-; RV32V-NEXT:    slli a7, a7, 3
-; RV32V-NEXT:    mv t1, a7
-; RV32V-NEXT:    slli a7, a7, 1
-; RV32V-NEXT:    add t1, t1, a7
-; RV32V-NEXT:    slli a7, a7, 1
-; RV32V-NEXT:    add t1, t1, a7
-; RV32V-NEXT:    slli a7, a7, 2
-; RV32V-NEXT:    add a7, a7, t1
-; RV32V-NEXT:    add a7, sp, a7
-; RV32V-NEXT:    addi a7, a7, 288
-; RV32V-NEXT:    vl8r.v v24, (a7) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a7, vlenb
-; RV32V-NEXT:    slli a7, a7, 5
-; RV32V-NEXT:    add a7, sp, a7
-; RV32V-NEXT:    addi a7, a7, 288
-; RV32V-NEXT:    vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr a7, vlenb
-; RV32V-NEXT:    slli a7, a7, 3
-; RV32V-NEXT:    mv t1, a7
-; RV32V-NEXT:    slli a7, a7, 1
-; RV32V-NEXT:    add t1, t1, a7
-; RV32V-NEXT:    slli a7, a7, 2
-; RV32V-NEXT:    add a7, a7, t1
-; RV32V-NEXT:    add a7, sp, a7
-; RV32V-NEXT:    addi a7, a7, 288
-; RV32V-NEXT:    vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a7, vlenb
-; RV32V-NEXT:    slli a7, a7, 6
-; RV32V-NEXT:    mv t1, a7
-; RV32V-NEXT:    slli a7, a7, 1
-; RV32V-NEXT:    add a7, a7, t1
-; RV32V-NEXT:    add a7, sp, a7
-; RV32V-NEXT:    addi a7, a7, 288
-; RV32V-NEXT:    vl8r.v v24, (a7) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a7, vlenb
-; RV32V-NEXT:    slli a7, a7, 4
-; RV32V-NEXT:    mv t1, a7
-; RV32V-NEXT:    slli a7, a7, 3
-; RV32V-NEXT:    add a7, a7, t1
-; RV32V-NEXT:    add a7, sp, a7
-; RV32V-NEXT:    addi a7, a7, 288
-; RV32V-NEXT:    vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a7, vlenb
-; RV32V-NEXT:    slli a7, a7, 4
-; RV32V-NEXT:    mv t1, a7
-; RV32V-NEXT:    slli a7, a7, 1
-; RV32V-NEXT:    add t1, t1, a7
-; RV32V-NEXT:    slli a7, a7, 2
-; RV32V-NEXT:    add a7, a7, t1
-; RV32V-NEXT:    add a7, sp, a7
-; RV32V-NEXT:    addi a7, a7, 288
-; RV32V-NEXT:    vl8r.v v24, (a7) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a7, vlenb
-; RV32V-NEXT:    slli a7, a7, 6
-; RV32V-NEXT:    mv t1, a7
-; RV32V-NEXT:    slli a7, a7, 1
-; RV32V-NEXT:    add a7, a7, t1
-; RV32V-NEXT:    add a7, sp, a7
-; RV32V-NEXT:    addi a7, a7, 288
-; RV32V-NEXT:    vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
-; RV32V-NEXT:    csrr a7, vlenb
-; RV32V-NEXT:    slli a7, a7, 4
-; RV32V-NEXT:    mv t0, a7
-; RV32V-NEXT:    slli a7, a7, 1
-; RV32V-NEXT:    add t0, t0, a7
-; RV32V-NEXT:    slli a7, a7, 2
-; RV32V-NEXT:    add a7, a7, t0
-; RV32V-NEXT:    add a7, sp, a7
-; RV32V-NEXT:    addi a7, a7, 288
-; RV32V-NEXT:    vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (a6), zero
-; RV32V-NEXT:    vlse64.v v24, (a5), zero
-; RV32V-NEXT:    csrr a5, vlenb
-; RV32V-NEXT:    slli a5, a5, 3
-; RV32V-NEXT:    mv a6, a5
-; RV32V-NEXT:    slli a5, a5, 1
-; RV32V-NEXT:    add a6, a6, a5
-; RV32V-NEXT:    slli a5, a5, 1
-; RV32V-NEXT:    add a6, a6, a5
-; RV32V-NEXT:    slli a5, a5, 2
-; RV32V-NEXT:    add a5, a5, a6
-; RV32V-NEXT:    add a5, sp, a5
-; RV32V-NEXT:    addi a5, a5, 288
-; RV32V-NEXT:    vs8r.v v24, (a5) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (a2), zero
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a2, a2, 7
-; RV32V-NEXT:    add a2, sp, a2
-; RV32V-NEXT:    addi a2, a2, 288
-; RV32V-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a2, a2, 4
-; RV32V-NEXT:    mv a5, a2
-; RV32V-NEXT:    slli a2, a2, 1
-; RV32V-NEXT:    add a5, a5, a2
-; RV32V-NEXT:    slli a2, a2, 2
-; RV32V-NEXT:    add a2, a2, a5
-; RV32V-NEXT:    add a2, sp, a2
-; RV32V-NEXT:    addi a2, a2, 288
-; RV32V-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a2, a2, 3
-; RV32V-NEXT:    mv a5, a2
-; RV32V-NEXT:    slli a2, a2, 1
-; RV32V-NEXT:    add a2, a2, a5
-; RV32V-NEXT:    add a2, sp, a2
-; RV32V-NEXT:    addi a2, a2, 288
-; RV32V-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a2, a2, 4
-; RV32V-NEXT:    mv a5, a2
-; RV32V-NEXT:    slli a2, a2, 2
-; RV32V-NEXT:    add a2, a2, a5
-; RV32V-NEXT:    add a2, sp, a2
-; RV32V-NEXT:    addi a2, a2, 288
-; RV32V-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a2, a2, 3
-; RV32V-NEXT:    mv a5, a2
-; RV32V-NEXT:    slli a2, a2, 1
-; RV32V-NEXT:    add a5, a5, a2
-; RV32V-NEXT:    slli a2, a2, 1
-; RV32V-NEXT:    add a5, a5, a2
-; RV32V-NEXT:    slli a2, a2, 2
-; RV32V-NEXT:    add a2, a2, a5
-; RV32V-NEXT:    add a2, sp, a2
-; RV32V-NEXT:    addi a2, a2, 288
-; RV32V-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a2, a2, 3
-; RV32V-NEXT:    mv a5, a2
-; RV32V-NEXT:    slli a2, a2, 4
-; RV32V-NEXT:    add a2, a2, a5
-; RV32V-NEXT:    add a2, sp, a2
-; RV32V-NEXT:    addi a2, a2, 288
-; RV32V-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a2, a2, 7
-; RV32V-NEXT:    add a2, sp, a2
-; RV32V-NEXT:    addi a2, a2, 288
-; RV32V-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a2, a2, 3
-; RV32V-NEXT:    mv a5, a2
-; RV32V-NEXT:    slli a2, a2, 1
-; RV32V-NEXT:    add a5, a5, a2
-; RV32V-NEXT:    slli a2, a2, 1
-; RV32V-NEXT:    add a5, a5, a2
-; RV32V-NEXT:    slli a2, a2, 2
-; RV32V-NEXT:    add a2, a2, a5
-; RV32V-NEXT:    add a2, sp, a2
-; RV32V-NEXT:    addi a2, a2, 288
-; RV32V-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (a3), zero
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a2, a2, 7
-; RV32V-NEXT:    add a2, sp, a2
-; RV32V-NEXT:    addi a2, a2, 288
-; RV32V-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (a4), zero
-; RV32V-NEXT:    vlse64.v v24, (a1), zero
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 4
-; RV32V-NEXT:    mv a2, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a2, a2, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a1, a1, a2
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (s4), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (s11), zero
-; RV32V-NEXT:    vlse64.v v24, (s8), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (s10), zero
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v0, v8, v24
-; RV32V-NEXT:    lui a0, 262144
-; RV32V-NEXT:    vand.vx v8, v8, a0
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 8
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vmul.vv v16, v16, v0
-; RV32V-NEXT:    vxor.vv v16, v8, v16
-; RV32V-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vnsrl.wx v8, v16, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add sp, sp, a0
-; RV32V-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    addi sp, sp, 352
-; RV32V-NEXT:    ret
-;
-; RV64V-LABEL: clmulh_nxv8i32_vx:
-; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV64V-NEXT:    vmv.v.x v24, a0
-; RV64V-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64V-NEXT:    vzext.vf2 v16, v8
-; RV64V-NEXT:    li a1, 16
-; RV64V-NEXT:    li a0, 32
-; RV64V-NEXT:    li a2, 64
-; RV64V-NEXT:    vzext.vf2 v8, v24
-; RV64V-NEXT:    vand.vi v24, v8, 2
-; RV64V-NEXT:    vand.vi v0, v8, 1
-; RV64V-NEXT:    vmul.vv v24, v16, v24
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v0, v24
-; RV64V-NEXT:    vand.vi v0, v8, 4
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vi v0, v8, 8
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a1
-; RV64V-NEXT:    li a1, 128
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a0
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    li a2, 256
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a1
-; RV64V-NEXT:    li a1, 512
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    li a2, 1024
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a1
-; RV64V-NEXT:    li a1, 1
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    slli a2, a1, 11
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 1
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 2
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 4
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 8
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 16
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 32
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 64
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 128
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 256
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 512
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 1024
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 2048
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 4096
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 8192
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 16384
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 32768
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 65536
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 131072
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    lui a2, 262144
-; RV64V-NEXT:    slli a1, a1, 31
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    vand.vx v8, v8, a1
-; RV64V-NEXT:    vmul.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vmul.vv v8, v16, v8
-; RV64V-NEXT:    vxor.vv v16, v24, v8
-; RV64V-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64V-NEXT:    vnsrl.wx v8, v16, a0
-; RV64V-NEXT:    ret
-;
-; RV32ZVBC-LABEL: clmulh_nxv8i32_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32ZVBC-NEXT:    vmv.v.x v24, a0
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vzext.vf2 v16, v8
-; RV32ZVBC-NEXT:    vzext.vf2 v8, v24
-; RV32ZVBC-NEXT:    vclmul.vv v16, v16, v8
-; RV32ZVBC-NEXT:    li a0, 32
-; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32ZVBC-NEXT:    vnsrl.wx v8, v16, a0
-; RV32ZVBC-NEXT:    ret
-;
-; RV64ZVBC-LABEL: clmulh_nxv8i32_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV64ZVBC-NEXT:    vmv.v.x v24, a0
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vzext.vf2 v16, v8
-; RV64ZVBC-NEXT:    vzext.vf2 v8, v24
-; RV64ZVBC-NEXT:    vclmul.vv v16, v16, v8
-; RV64ZVBC-NEXT:    li a0, 32
-; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV64ZVBC-NEXT:    vnsrl.wx v8, v16, a0
-; RV64ZVBC-NEXT:    ret
-  %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i64 0
-  %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
-  %va.ext = zext <vscale x 8 x i32> %va to <vscale x 8 x i64>
-  %vb.ext = zext <vscale x 8 x i32> %vb to <vscale x 8 x i64>
-  %clmul = call <vscale x 8 x i64> @llvm.clmul.nxv8i64(<vscale x 8 x i64> %va.ext, <vscale x 8 x i64> %vb.ext)
-  %res.ext = lshr <vscale x 8 x i64> %clmul, splat(i64 32)
-  %res = trunc <vscale x 8 x i64> %res.ext to <vscale x 8 x i32>
-  ret <vscale x 8 x i32> %res
-}
-
-define <vscale x 16 x i32> @clmulh_nxv16i32_vv(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb) nounwind {
-; RV32-LABEL: clmulh_nxv16i32_vv:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -80
-; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 68(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 64(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 52(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 48(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s8, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s9, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s10, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s11, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    sub sp, sp, a0
-; RV32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    lui a0, 16
-; RV32-NEXT:    vsrl.vi v0, v8, 24
-; RV32-NEXT:    addi a0, a0, -256
-; RV32-NEXT:    vand.vx v24, v24, a0
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    mv a2, a1
-; RV32-NEXT:    slli a1, a1, 1
-; RV32-NEXT:    add a2, a2, a1
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsrl.vi v0, v16, 8
-; RV32-NEXT:    vand.vx v0, v0, a0
-; RV32-NEXT:    vsrl.vi v24, v16, 24
-; RV32-NEXT:    vor.vv v0, v0, v24
-; RV32-NEXT:    vsll.vi v24, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a0
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vsll.vi v24, v16, 24
-; RV32-NEXT:    vand.vx v16, v16, a0
-; RV32-NEXT:    vsll.vi v16, v16, 8
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    lui t6, 349525
-; RV32-NEXT:    li s5, 16
-; RV32-NEXT:    li t2, 32
-; RV32-NEXT:    li a7, 256
-; RV32-NEXT:    li t0, 512
-; RV32-NEXT:    li t1, 1024
-; RV32-NEXT:    li s6, 1
-; RV32-NEXT:    lui t3, 1
-; RV32-NEXT:    lui t4, 2
-; RV32-NEXT:    lui t5, 4
-; RV32-NEXT:    lui s0, 8
-; RV32-NEXT:    lui s1, 32
-; RV32-NEXT:    lui s2, 64
-; RV32-NEXT:    lui s3, 128
-; RV32-NEXT:    lui s4, 256
-; RV32-NEXT:    lui s7, 512
-; RV32-NEXT:    lui s8, 1024
-; RV32-NEXT:    lui s9, 2048
-; RV32-NEXT:    lui s10, 4096
-; RV32-NEXT:    lui s11, 8192
-; RV32-NEXT:    lui ra, 16384
-; RV32-NEXT:    lui a5, 32768
-; RV32-NEXT:    addi a3, a1, -241
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    addi a1, t6, 1365
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vsrl.vi v24, v16, 4
-; RV32-NEXT:    vand.vx v16, v16, a3
-; RV32-NEXT:    vand.vx v24, v24, a3
-; RV32-NEXT:    vsll.vi v16, v16, 4
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    vsrl.vi v24, v16, 2
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vand.vx v24, v24, a2
-; RV32-NEXT:    vsll.vi v16, v16, 2
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    vsrl.vi v24, v16, 1
-; RV32-NEXT:    vand.vx v16, v16, a1
-; RV32-NEXT:    vand.vx v24, v24, a1
-; RV32-NEXT:    vadd.vv v16, v16, v16
-; RV32-NEXT:    vor.vv v0, v24, v16
-; RV32-NEXT:    vand.vx v16, v0, s5
-; RV32-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    mv a4, a0
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    lui t6, 65536
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    mv a4, a0
-; RV32-NEXT:    slli a0, a0, 1
-; RV32-NEXT:    add a4, a4, a0
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vand.vx v8, v8, a3
-; RV32-NEXT:    vand.vx v16, v16, a3
-; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vand.vx v8, v8, a2
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vsll.vi v8, v8, 2
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vand.vx v8, v8, a1
-; RV32-NEXT:    vand.vx v16, v16, a1
-; RV32-NEXT:    vadd.vv v8, v8, v8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vand.vx v16, v0, t2
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    mv a4, a0
-; RV32-NEXT:    slli a0, a0, 1
-; RV32-NEXT:    add a4, a4, a0
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    lui a4, 131072
-; RV32-NEXT:    slli s6, s6, 11
-; RV32-NEXT:    lui t2, 262144
-; RV32-NEXT:    lui s5, 524288
-; RV32-NEXT:    li a6, 64
-; RV32-NEXT:    vand.vx v16, v0, a6
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a0, a6
-; RV32-NEXT:    slli a6, a6, 5
-; RV32-NEXT:    add a6, a6, a0
-; RV32-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    li a6, 128
-; RV32-NEXT:    vand.vx v16, v0, a6
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 8
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, a7
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, t0
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 4
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, t1
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 2
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s6
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 5
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, t3
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 2
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, t4
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 4
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 2
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, t5
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s0
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 6
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    lui a6, 16
-; RV32-NEXT:    vand.vx v16, v0, a6
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 2
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s1
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 4
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 2
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s2
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 2
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 2
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s3
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 5
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 2
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s4
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s7
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 4
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s8
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 4
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s9
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s10
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s11
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 4
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, ra
-; RV32-NEXT:    csrr a6, vlenb
-; RV32-NEXT:    slli a6, a6, 3
-; RV32-NEXT:    mv a7, a6
-; RV32-NEXT:    slli a6, a6, 2
-; RV32-NEXT:    add a7, a7, a6
-; RV32-NEXT:    slli a6, a6, 1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, sp, a6
-; RV32-NEXT:    addi a6, a6, 16
-; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, a5
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 5
-; RV32-NEXT:    mv a6, a5
-; RV32-NEXT:    slli a5, a5, 1
-; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, t6
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 3
-; RV32-NEXT:    mv a6, a5
-; RV32-NEXT:    slli a5, a5, 1
-; RV32-NEXT:    add a6, a6, a5
-; RV32-NEXT:    slli a5, a5, 2
-; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, a4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vi v16, v0, 2
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vi v24, v0, 1
-; RV32-NEXT:    vand.vi v16, v0, 4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vi v16, v0, 8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, t2
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 6
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vx v16, v0, s5
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vmul.vv v16, v8, v24
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v24, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v0, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 8
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 6
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 6
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 7
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 7
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 8
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 6
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v16, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v8, v8, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v16, v16, v8
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vxor.vv v16, v16, v0
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v16, v16, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v16, v16, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v16, v16, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v16, v16, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v16, v8
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 6
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 7
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v24, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v0, v24, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v0, v0, v8
-; RV32-NEXT:    vsll.vi v16, v16, 24
-; RV32-NEXT:    vand.vx v8, v24, a0
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v16, v16, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v8, v0, v8
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v8, v8, v0
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v8, v8, v0
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 8
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v8, v8, v0
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v8, v8, v0
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a5, a5, a4
-; RV32-NEXT:    slli a4, a4, 1
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v24, v24, 8
-; RV32-NEXT:    vand.vx v24, v24, a0
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vand.vx v8, v8, a3
-; RV32-NEXT:    vand.vx v16, v16, a3
-; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vand.vx v8, v8, a2
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vsll.vi v8, v8, 2
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vand.vx v8, v8, a1
-; RV32-NEXT:    vand.vx v16, v16, a1
-; RV32-NEXT:    vadd.vv v8, v8, v8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v8, v8, 1
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 64(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 52(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 48(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s8, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s9, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s10, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s11, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 80
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: clmulh_nxv16i32_vv:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -144
-; RV64-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 128(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s1, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s2, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s3, 104(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s4, 96(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s5, 88(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s6, 80(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s7, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s8, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s9, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s10, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s11, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 5
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    sub sp, sp, a0
-; RV64-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 8
-; RV64-NEXT:    lui a0, 16
-; RV64-NEXT:    vsrl.vi v0, v8, 24
-; RV64-NEXT:    addi a0, a0, -256
-; RV64-NEXT:    vand.vx v24, v24, a0
-; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    mv a2, a1
-; RV64-NEXT:    slli a1, a1, 1
-; RV64-NEXT:    add a2, a2, a1
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 32
-; RV64-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vsrl.vi v0, v16, 8
-; RV64-NEXT:    vand.vx v0, v0, a0
-; RV64-NEXT:    vsrl.vi v24, v16, 24
-; RV64-NEXT:    vor.vv v0, v0, v24
-; RV64-NEXT:    vsll.vi v24, v8, 24
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vsll.vi v8, v8, 8
-; RV64-NEXT:    vor.vv v8, v24, v8
-; RV64-NEXT:    vsll.vi v24, v16, 24
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsll.vi v16, v16, 8
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    lui a1, 61681
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui t6, 349525
-; RV64-NEXT:    li s5, 16
-; RV64-NEXT:    li t2, 32
-; RV64-NEXT:    li a7, 256
-; RV64-NEXT:    li t0, 512
-; RV64-NEXT:    li t1, 1024
-; RV64-NEXT:    li s6, 1
-; RV64-NEXT:    lui t3, 1
-; RV64-NEXT:    lui t4, 2
-; RV64-NEXT:    lui t5, 4
-; RV64-NEXT:    lui s0, 8
-; RV64-NEXT:    lui s1, 32
-; RV64-NEXT:    lui s2, 64
-; RV64-NEXT:    lui s3, 128
-; RV64-NEXT:    lui s4, 256
-; RV64-NEXT:    lui s7, 512
-; RV64-NEXT:    lui s8, 1024
-; RV64-NEXT:    lui s9, 2048
-; RV64-NEXT:    lui s10, 4096
-; RV64-NEXT:    lui s11, 8192
-; RV64-NEXT:    lui ra, 16384
-; RV64-NEXT:    lui a5, 32768
-; RV64-NEXT:    addi a3, a1, -241
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a1, t6, 1365
-; RV64-NEXT:    vor.vv v16, v16, v0
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vand.vx v24, v24, a3
-; RV64-NEXT:    vsll.vi v16, v16, 4
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vi v24, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a2
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsll.vi v16, v16, 2
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v16, v16, a1
-; RV64-NEXT:    vand.vx v24, v24, a1
-; RV64-NEXT:    vadd.vv v16, v16, v16
-; RV64-NEXT:    vor.vv v0, v24, v16
-; RV64-NEXT:    vand.vx v16, v0, s5
-; RV64-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    mv a4, a0
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 32
-; RV64-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    lui t6, 65536
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    mv a4, a0
-; RV64-NEXT:    slli a0, a0, 1
-; RV64-NEXT:    add a4, a4, a0
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 32
-; RV64-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vsll.vi v8, v8, 4
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vand.vx v16, v16, a2
-; RV64-NEXT:    vsll.vi v8, v8, 2
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vand.vx v16, v16, a1
-; RV64-NEXT:    vadd.vv v8, v8, v8
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vand.vx v16, v0, t2
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    mv a4, a0
-; RV64-NEXT:    slli a0, a0, 1
-; RV64-NEXT:    add a4, a4, a0
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 32
-; RV64-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    lui a4, 131072
-; RV64-NEXT:    slli s6, s6, 11
-; RV64-NEXT:    lui t2, 262144
-; RV64-NEXT:    lui s5, 524288
-; RV64-NEXT:    li a6, 64
-; RV64-NEXT:    vand.vx v16, v0, a6
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a0, a6
-; RV64-NEXT:    slli a6, a6, 5
-; RV64-NEXT:    add a6, a6, a0
-; RV64-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    li a6, 128
-; RV64-NEXT:    vand.vx v16, v0, a6
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 8
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, a7
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t0
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t1
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s6
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 5
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t3
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t4
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t5
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s0
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 6
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    lui a6, 16
-; RV64-NEXT:    vand.vx v16, v0, a6
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s1
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s2
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s3
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 5
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s4
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s7
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s8
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s9
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s10
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s11
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, ra
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, a5
-; RV64-NEXT:    csrr a5, vlenb
-; RV64-NEXT:    slli a5, a5, 5
-; RV64-NEXT:    mv a6, a5
-; RV64-NEXT:    slli a5, a5, 1
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    add a5, sp, a5
-; RV64-NEXT:    addi a5, a5, 32
-; RV64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t6
-; RV64-NEXT:    csrr a5, vlenb
-; RV64-NEXT:    slli a5, a5, 3
-; RV64-NEXT:    mv a6, a5
-; RV64-NEXT:    slli a5, a5, 1
-; RV64-NEXT:    add a6, a6, a5
-; RV64-NEXT:    slli a5, a5, 2
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    add a5, sp, a5
-; RV64-NEXT:    addi a5, a5, 32
-; RV64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, a4
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vi v16, v0, 2
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vi v24, v0, 1
-; RV64-NEXT:    vand.vi v16, v0, 4
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vi v16, v0, 8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t2
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s5
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vmul.vv v16, v8, v24
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v24, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v0, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 8
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    addi a4, sp, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 7
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 7
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 8
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v8, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    vxor.vv v16, v16, v24
-; RV64-NEXT:    vxor.vv v16, v16, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v16, v8
-; RV64-NEXT:    addi a4, sp, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 7
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v0, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v0, v0, v8
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    vand.vx v8, v24, a0
-; RV64-NEXT:    vsll.vi v8, v8, 8
-; RV64-NEXT:    vor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v0, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 8
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    vsrl.vi v24, v24, 8
-; RV64-NEXT:    vand.vx v24, v24, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 24
-; RV64-NEXT:    vor.vv v8, v24, v8
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vsll.vi v8, v8, 4
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vand.vx v16, v16, a2
-; RV64-NEXT:    vsll.vi v8, v8, 2
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vand.vx v16, v16, a1
-; RV64-NEXT:    vadd.vv v8, v8, v8
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v8, v8, 1
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 5
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    ld ra, 136(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 128(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s1, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s2, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s3, 104(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s4, 96(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s5, 88(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s6, 80(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s7, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s8, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s9, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s10, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s11, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 144
-; RV64-NEXT:    ret
-  %va.ext = zext <vscale x 16 x i32> %va to <vscale x 16 x i64>
-  %vb.ext = zext <vscale x 16 x i32> %vb to <vscale x 16 x i64>
-  %clmul = call <vscale x 16 x i64> @llvm.clmul.nxv16i64(<vscale x 16 x i64> %va.ext, <vscale x 16 x i64> %vb.ext)
-  %res.ext = lshr <vscale x 16 x i64> %clmul, splat(i64 32)
-  %res = trunc <vscale x 16 x i64> %res.ext to <vscale x 16 x i32>
-  ret <vscale x 16 x i32> %res
-}
-
-define <vscale x 16 x i32> @clmulh_nxv16i32_vx(<vscale x 16 x i32> %va, i32 %b) nounwind {
-; RV32-LABEL: clmulh_nxv16i32_vx:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    lui a4, 16
-; RV32-NEXT:    vsrl.vi v24, v8, 24
-; RV32-NEXT:    vsll.vi v0, v8, 24
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    lui a5, 209715
-; RV32-NEXT:    lui a7, 349525
-; RV32-NEXT:    srli a3, a0, 8
-; RV32-NEXT:    srli a6, a0, 24
-; RV32-NEXT:    addi a1, a4, -256
-; RV32-NEXT:    and a3, a3, a1
-; RV32-NEXT:    or t0, a3, a6
-; RV32-NEXT:    slli a3, a0, 24
-; RV32-NEXT:    and a0, a0, a1
-; RV32-NEXT:    slli a0, a0, 8
-; RV32-NEXT:    or t1, a3, a0
-; RV32-NEXT:    li a6, 1
-; RV32-NEXT:    addi a3, a2, -241
-; RV32-NEXT:    addi a2, a5, 819
-; RV32-NEXT:    addi a0, a7, 1365
-; RV32-NEXT:    vand.vx v16, v16, a1
-; RV32-NEXT:    vand.vx v8, v8, a1
-; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v0, v8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    or a5, t1, t0
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vand.vx v8, v8, a3
-; RV32-NEXT:    srli a7, a5, 4
-; RV32-NEXT:    and a5, a5, a3
-; RV32-NEXT:    vand.vx v16, v16, a3
-; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    and a7, a7, a3
-; RV32-NEXT:    slli a5, a5, 4
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    or a5, a7, a5
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vand.vx v8, v8, a2
-; RV32-NEXT:    srli a7, a5, 2
-; RV32-NEXT:    and a5, a5, a2
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vsll.vi v8, v8, 2
-; RV32-NEXT:    and a7, a7, a2
-; RV32-NEXT:    slli a5, a5, 2
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    or a5, a7, a5
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vand.vx v8, v8, a0
-; RV32-NEXT:    srli a7, a5, 1
-; RV32-NEXT:    and a5, a5, a0
-; RV32-NEXT:    vand.vx v16, v16, a0
-; RV32-NEXT:    vadd.vv v8, v8, v8
-; RV32-NEXT:    and a7, a7, a0
-; RV32-NEXT:    slli a5, a5, 1
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    or a5, a7, a5
-; RV32-NEXT:    andi a7, a5, 2
-; RV32-NEXT:    vmul.vx v16, v8, a7
-; RV32-NEXT:    andi a7, a5, 1
-; RV32-NEXT:    vmul.vx v24, v8, a7
-; RV32-NEXT:    andi a7, a5, 4
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    andi a7, a5, 8
-; RV32-NEXT:    vxor.vv v16, v24, v16
-; RV32-NEXT:    vmul.vx v24, v8, a7
-; RV32-NEXT:    andi a7, a5, 16
-; RV32-NEXT:    vxor.vv v16, v16, v0
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    andi a7, a5, 32
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, a7
-; RV32-NEXT:    andi a7, a5, 64
-; RV32-NEXT:    vxor.vv v16, v16, v0
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    andi a7, a5, 128
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    vmul.vx v24, v8, a7
-; RV32-NEXT:    andi a7, a5, 256
-; RV32-NEXT:    vxor.vv v16, v16, v0
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    andi a7, a5, 512
-; RV32-NEXT:    vxor.vv v16, v16, v24
-; RV32-NEXT:    csrr t0, vlenb
-; RV32-NEXT:    slli t0, t0, 3
-; RV32-NEXT:    add t0, sp, t0
-; RV32-NEXT:    addi t0, t0, 16
-; RV32-NEXT:    vs8r.v v16, (t0) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vmul.vx v24, v8, a7
-; RV32-NEXT:    andi a7, a5, 1024
-; RV32-NEXT:    vxor.vv v0, v16, v0
-; RV32-NEXT:    vxor.vv v24, v0, v24
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    lui a7, 1
-; RV32-NEXT:    slli a6, a6, 11
-; RV32-NEXT:    and a6, a5, a6
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a6
-; RV32-NEXT:    lui a6, 2
-; RV32-NEXT:    and a7, a5, a7
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    lui a7, 4
-; RV32-NEXT:    and a6, a5, a6
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a6
-; RV32-NEXT:    lui a6, 8
-; RV32-NEXT:    and a7, a5, a7
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    lui a7, 32
-; RV32-NEXT:    and a6, a5, a6
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a6
-; RV32-NEXT:    lui a6, 64
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a4
-; RV32-NEXT:    lui a4, 128
-; RV32-NEXT:    and a7, a5, a7
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    lui a7, 256
-; RV32-NEXT:    and a6, a5, a6
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a6
-; RV32-NEXT:    lui a6, 512
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a4
-; RV32-NEXT:    lui a4, 1024
-; RV32-NEXT:    and a7, a5, a7
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    lui a7, 2048
-; RV32-NEXT:    and a6, a5, a6
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a6
-; RV32-NEXT:    lui a6, 4096
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a4
-; RV32-NEXT:    lui a4, 8192
-; RV32-NEXT:    and a7, a5, a7
-; RV32-NEXT:    and a6, a5, a6
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a7
-; RV32-NEXT:    vxor.vv v24, v24, v0
-; RV32-NEXT:    vmul.vx v0, v8, a6
-; RV32-NEXT:    vxor.vv v0, v24, v0
-; RV32-NEXT:    vmul.vx v16, v8, a4
-; RV32-NEXT:    vxor.vv v16, v0, v16
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vsll.vi v16, v16, 24
-; RV32-NEXT:    vand.vx v0, v24, a1
-; RV32-NEXT:    vsll.vi v0, v0, 8
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    lui a4, 16384
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vmul.vx v0, v8, a4
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vxor.vv v0, v16, v0
-; RV32-NEXT:    lui a4, 32768
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vmul.vx v16, v8, a4
-; RV32-NEXT:    vxor.vv v16, v0, v16
-; RV32-NEXT:    lui a4, 65536
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vmul.vx v0, v8, a4
-; RV32-NEXT:    vxor.vv v16, v16, v0
-; RV32-NEXT:    lui a4, 131072
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vmul.vx v0, v8, a4
-; RV32-NEXT:    vxor.vv v16, v16, v0
-; RV32-NEXT:    lui a4, 262144
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vmul.vx v0, v8, a4
-; RV32-NEXT:    vxor.vv v16, v16, v0
-; RV32-NEXT:    lui a4, 524288
-; RV32-NEXT:    and a4, a5, a4
-; RV32-NEXT:    vmul.vx v8, v8, a4
-; RV32-NEXT:    vxor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v24, 8
-; RV32-NEXT:    vand.vx v16, v16, a1
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vand.vx v8, v8, a3
-; RV32-NEXT:    vand.vx v16, v16, a3
-; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vand.vx v8, v8, a2
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vsll.vi v8, v8, 2
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vand.vx v8, v8, a0
-; RV32-NEXT:    vand.vx v16, v16, a0
-; RV32-NEXT:    vadd.vv v8, v8, v8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v8, v8, 1
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: clmulh_nxv16i32_vx:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -144
-; RV64-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 128(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s1, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s2, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s3, 104(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s4, 96(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s5, 88(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s6, 80(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s7, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s8, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s9, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s10, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s11, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 5
-; RV64-NEXT:    mv a2, a1
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV64-NEXT:    vmv.v.x v0, a0
-; RV64-NEXT:    vsrl.vi v16, v8, 8
-; RV64-NEXT:    lui a0, 16
-; RV64-NEXT:    vsrl.vi v24, v8, 24
-; RV64-NEXT:    addi a0, a0, -256
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    mv a2, a1
-; RV64-NEXT:    slli a1, a1, 1
-; RV64-NEXT:    add a2, a2, a1
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 32
-; RV64-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vsrl.vi v24, v0, 8
-; RV64-NEXT:    vand.vx v24, v24, a0
-; RV64-NEXT:    vsrl.vi v16, v0, 24
-; RV64-NEXT:    vor.vv v24, v24, v16
-; RV64-NEXT:    vsll.vi v16, v8, 24
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vsll.vi v8, v8, 8
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsll.vi v16, v0, 24
-; RV64-NEXT:    vand.vx v0, v0, a0
-; RV64-NEXT:    vsll.vi v0, v0, 8
-; RV64-NEXT:    vor.vv v0, v16, v0
-; RV64-NEXT:    lui a1, 61681
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui t6, 349525
-; RV64-NEXT:    li s5, 16
-; RV64-NEXT:    li t2, 32
-; RV64-NEXT:    li a7, 256
-; RV64-NEXT:    li t0, 512
-; RV64-NEXT:    li t1, 1024
-; RV64-NEXT:    li s6, 1
-; RV64-NEXT:    lui t3, 1
-; RV64-NEXT:    lui t4, 2
-; RV64-NEXT:    lui t5, 4
-; RV64-NEXT:    lui s0, 8
-; RV64-NEXT:    lui s1, 32
-; RV64-NEXT:    lui s2, 64
-; RV64-NEXT:    lui s3, 128
-; RV64-NEXT:    lui s4, 256
-; RV64-NEXT:    lui s7, 512
-; RV64-NEXT:    lui s8, 1024
-; RV64-NEXT:    lui s9, 2048
-; RV64-NEXT:    lui s10, 4096
-; RV64-NEXT:    lui s11, 8192
-; RV64-NEXT:    lui ra, 16384
-; RV64-NEXT:    lui a5, 32768
-; RV64-NEXT:    addi a3, a1, -241
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a1, t6, 1365
-; RV64-NEXT:    vor.vv v16, v0, v24
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vand.vx v24, v24, a3
-; RV64-NEXT:    vsll.vi v16, v16, 4
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vi v24, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a2
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsll.vi v16, v16, 2
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v16, v16, a1
-; RV64-NEXT:    vand.vx v24, v24, a1
-; RV64-NEXT:    vadd.vv v16, v16, v16
-; RV64-NEXT:    vor.vv v0, v24, v16
-; RV64-NEXT:    vand.vx v16, v0, s5
-; RV64-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    mv a4, a0
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 32
-; RV64-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    lui t6, 65536
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    mv a4, a0
-; RV64-NEXT:    slli a0, a0, 1
-; RV64-NEXT:    add a4, a4, a0
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 32
-; RV64-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vsll.vi v8, v8, 4
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vand.vx v16, v16, a2
-; RV64-NEXT:    vsll.vi v8, v8, 2
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vand.vx v16, v16, a1
-; RV64-NEXT:    vadd.vv v8, v8, v8
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vand.vx v16, v0, t2
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    mv a4, a0
-; RV64-NEXT:    slli a0, a0, 1
-; RV64-NEXT:    add a4, a4, a0
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 32
-; RV64-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    lui a4, 131072
-; RV64-NEXT:    slli s6, s6, 11
-; RV64-NEXT:    lui t2, 262144
-; RV64-NEXT:    lui s5, 524288
-; RV64-NEXT:    li a6, 64
-; RV64-NEXT:    vand.vx v16, v0, a6
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a0, a6
-; RV64-NEXT:    slli a6, a6, 5
-; RV64-NEXT:    add a6, a6, a0
-; RV64-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    li a6, 128
-; RV64-NEXT:    vand.vx v16, v0, a6
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 8
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, a7
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t0
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t1
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s6
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 5
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t3
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t4
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t5
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s0
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 6
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    lui a6, 16
-; RV64-NEXT:    vand.vx v16, v0, a6
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s1
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s2
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s3
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 5
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s4
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s7
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s8
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s9
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s10
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s11
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 4
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, ra
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    mv a7, a6
-; RV64-NEXT:    slli a6, a6, 2
-; RV64-NEXT:    add a7, a7, a6
-; RV64-NEXT:    slli a6, a6, 1
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 32
-; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, a5
-; RV64-NEXT:    csrr a5, vlenb
-; RV64-NEXT:    slli a5, a5, 5
-; RV64-NEXT:    mv a6, a5
-; RV64-NEXT:    slli a5, a5, 1
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    add a5, sp, a5
-; RV64-NEXT:    addi a5, a5, 32
-; RV64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t6
-; RV64-NEXT:    csrr a5, vlenb
-; RV64-NEXT:    slli a5, a5, 3
-; RV64-NEXT:    mv a6, a5
-; RV64-NEXT:    slli a5, a5, 1
-; RV64-NEXT:    add a6, a6, a5
-; RV64-NEXT:    slli a5, a5, 2
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    add a5, sp, a5
-; RV64-NEXT:    addi a5, a5, 32
-; RV64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, a4
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vi v16, v0, 2
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vi v24, v0, 1
-; RV64-NEXT:    vand.vi v16, v0, 4
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vi v16, v0, 8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, t2
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vand.vx v16, v0, s5
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vmul.vv v16, v8, v24
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v24, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v0, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 8
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    addi a4, sp, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 7
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 7
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 8
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmul.vv v8, v8, v16
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    vxor.vv v16, v16, v24
-; RV64-NEXT:    vxor.vv v16, v16, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v16, v8
-; RV64-NEXT:    addi a4, sp, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 6
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 7
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v24, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v0, v24, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v0, v0, v8
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    vand.vx v8, v24, a0
-; RV64-NEXT:    vsll.vi v8, v8, 8
-; RV64-NEXT:    vor.vv v16, v16, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v0, v8
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 8
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a5, a5, a4
-; RV64-NEXT:    slli a4, a4, 1
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vxor.vv v8, v8, v0
-; RV64-NEXT:    vsrl.vi v24, v24, 8
-; RV64-NEXT:    vand.vx v24, v24, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 24
-; RV64-NEXT:    vor.vv v8, v24, v8
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vsll.vi v8, v8, 4
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vand.vx v16, v16, a2
-; RV64-NEXT:    vsll.vi v8, v8, 2
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vand.vx v16, v16, a1
-; RV64-NEXT:    vadd.vv v8, v8, v8
-; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v8, v8, 1
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 5
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    ld ra, 136(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 128(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s1, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s2, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s3, 104(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s4, 96(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s5, 88(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s6, 80(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s7, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s8, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s9, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s10, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s11, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 144
-; RV64-NEXT:    ret
-  %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i64 0
-  %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-  %va.ext = zext <vscale x 16 x i32> %va to <vscale x 16 x i64>
-  %vb.ext = zext <vscale x 16 x i32> %vb to <vscale x 16 x i64>
-  %clmul = call <vscale x 16 x i64> @llvm.clmul.nxv16i64(<vscale x 16 x i64> %va.ext, <vscale x 16 x i64> %vb.ext)
-  %res.ext = lshr <vscale x 16 x i64> %clmul, splat(i64 32)
-  %res = trunc <vscale x 16 x i64> %res.ext to <vscale x 16 x i32>
-  ret <vscale x 16 x i32> %res
-}
-
-define <vscale x 1 x i64> @clmulh_nxv1i64_vv(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb) nounwind {
-; RV32V-LABEL: clmulh_nxv1i64_vv:
-; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -352
-; RV32V-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    sub sp, sp, a0
-; RV32V-NEXT:    lui s7, 1044480
-; RV32V-NEXT:    lui a7, 524288
-; RV32V-NEXT:    li s11, 1
-; RV32V-NEXT:    li s8, 2
-; RV32V-NEXT:    li s9, 4
-; RV32V-NEXT:    li s10, 8
-; RV32V-NEXT:    li a3, 16
-; RV32V-NEXT:    li a4, 32
-; RV32V-NEXT:    li a5, 64
-; RV32V-NEXT:    li a6, 128
-; RV32V-NEXT:    li ra, 256
-; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    li a1, 1024
-; RV32V-NEXT:    lui a2, 1
-; RV32V-NEXT:    lui t0, 2
-; RV32V-NEXT:    lui t1, 4
-; RV32V-NEXT:    lui t2, 8
-; RV32V-NEXT:    lui t3, 16
-; RV32V-NEXT:    lui t4, 32
-; RV32V-NEXT:    lui t5, 64
-; RV32V-NEXT:    lui t6, 128
-; RV32V-NEXT:    lui s0, 256
-; RV32V-NEXT:    lui s1, 512
-; RV32V-NEXT:    lui s2, 1024
-; RV32V-NEXT:    lui s3, 2048
-; RV32V-NEXT:    lui s4, 4096
-; RV32V-NEXT:    lui s5, 8192
-; RV32V-NEXT:    lui s6, 16384
-; RV32V-NEXT:    sw s7, 272(sp)
-; RV32V-NEXT:    lui s7, 32768
-; RV32V-NEXT:    sw zero, 276(sp)
-; RV32V-NEXT:    sw a7, 264(sp)
-; RV32V-NEXT:    sw zero, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw s11, 260(sp)
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw s8, 252(sp)
-; RV32V-NEXT:    lui s8, 65536
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw s9, 244(sp)
-; RV32V-NEXT:    lui s9, 131072
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw s10, 236(sp)
-; RV32V-NEXT:    lui s10, 262144
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw a3, 228(sp)
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw a4, 220(sp)
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw a5, 212(sp)
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw a6, 204(sp)
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw ra, 196(sp)
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw a0, 188(sp)
-; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw a1, 180(sp)
-; RV32V-NEXT:    slli s11, s11, 11
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw s11, 172(sp)
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw a2, 164(sp)
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw t0, 156(sp)
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw t1, 148(sp)
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw t2, 140(sp)
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw t3, 132(sp)
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t4, 124(sp)
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t5, 116(sp)
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t6, 108(sp)
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw s0, 100(sp)
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw s1, 92(sp)
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw s2, 84(sp)
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw s3, 76(sp)
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw s4, 68(sp)
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw s5, 60(sp)
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw s6, 52(sp)
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw s7, 44(sp)
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw s8, 36(sp)
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    sw s9, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw s10, 20(sp)
-; RV32V-NEXT:    sw zero, 8(sp)
-; RV32V-NEXT:    sw a7, 12(sp)
-; RV32V-NEXT:    lui a0, 61681
-; RV32V-NEXT:    addi a0, a0, -241
-; RV32V-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32V-NEXT:    vmv.v.x v1, a0
-; RV32V-NEXT:    lui a0, 209715
-; RV32V-NEXT:    addi a0, a0, 819
-; RV32V-NEXT:    vmv.v.x v0, a0
-; RV32V-NEXT:    lui a0, 349525
-; RV32V-NEXT:    addi a0, a0, 1365
-; RV32V-NEXT:    vmv.v.x v10, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 272
-; RV32V-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32V-NEXT:    vlse64.v v3, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 264
-; RV32V-NEXT:    vlse64.v v14, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 256
-; RV32V-NEXT:    vlse64.v v15, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 248
-; RV32V-NEXT:    vlse64.v v16, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 240
-; RV32V-NEXT:    vlse64.v v17, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 232
-; RV32V-NEXT:    vlse64.v v18, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 224
-; RV32V-NEXT:    vlse64.v v19, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 216
-; RV32V-NEXT:    vlse64.v v20, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 208
-; RV32V-NEXT:    vlse64.v v21, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 200
-; RV32V-NEXT:    vlse64.v v22, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 192
-; RV32V-NEXT:    vlse64.v v23, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 184
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 176
-; RV32V-NEXT:    vlse64.v v25, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 168
-; RV32V-NEXT:    vlse64.v v26, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 160
-; RV32V-NEXT:    vlse64.v v27, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 152
-; RV32V-NEXT:    vlse64.v v28, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 144
-; RV32V-NEXT:    vlse64.v v29, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 136
-; RV32V-NEXT:    vlse64.v v30, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 128
-; RV32V-NEXT:    vlse64.v v31, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 120
-; RV32V-NEXT:    vlse64.v v7, (a0), zero
-; RV32V-NEXT:    li a6, 56
-; RV32V-NEXT:    vsrl.vi v10, v8, 24
-; RV32V-NEXT:    vsrl.vi v11, v8, 8
-; RV32V-NEXT:    vsrl.vx v12, v8, a6
-; RV32V-NEXT:    li ra, 40
-; RV32V-NEXT:    vsrl.vx v13, v8, ra
-; RV32V-NEXT:    vsll.vx v6, v8, a6
-; RV32V-NEXT:    vsrl.vx v5, v9, a6
-; RV32V-NEXT:    vsrl.vx v4, v9, ra
-; RV32V-NEXT:    addi a4, t3, -256
-; RV32V-NEXT:    vand.vx v13, v13, a4
-; RV32V-NEXT:    vor.vv v12, v13, v12
-; RV32V-NEXT:    vsll.vx v13, v9, a6
-; RV32V-NEXT:    vand.vx v4, v4, a4
-; RV32V-NEXT:    vor.vv v5, v4, v5
-; RV32V-NEXT:    vand.vx v4, v8, a4
-; RV32V-NEXT:    vsll.vx v4, v4, ra
-; RV32V-NEXT:    vor.vv v6, v6, v4
-; RV32V-NEXT:    vand.vx v4, v9, a4
-; RV32V-NEXT:    vsll.vx v4, v4, ra
-; RV32V-NEXT:    vor.vv v4, v13, v4
-; RV32V-NEXT:    vsrl.vi v13, v9, 24
-; RV32V-NEXT:    lui a5, 4080
-; RV32V-NEXT:    vand.vx v10, v10, a5
-; RV32V-NEXT:    vand.vv v11, v11, v3
-; RV32V-NEXT:    vor.vv v11, v11, v10
-; RV32V-NEXT:    vsrl.vi v10, v9, 8
-; RV32V-NEXT:    vand.vx v13, v13, a5
-; RV32V-NEXT:    vand.vv v10, v10, v3
-; RV32V-NEXT:    vor.vv v13, v10, v13
-; RV32V-NEXT:    addi a3, sp, 112
-; RV32V-NEXT:    vlse64.v v10, (a3), zero
-; RV32V-NEXT:    vor.vv v2, v11, v12
-; RV32V-NEXT:    vand.vx v11, v8, a5
-; RV32V-NEXT:    vsll.vi v11, v11, 24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v8, v3
-; RV32V-NEXT:    vsll.vi v8, v8, 8
-; RV32V-NEXT:    vor.vv v8, v11, v8
-; RV32V-NEXT:    addi a3, sp, 104
-; RV32V-NEXT:    vlse64.v v11, (a3), zero
-; RV32V-NEXT:    vor.vv v5, v13, v5
-; RV32V-NEXT:    vand.vx v12, v9, a5
-; RV32V-NEXT:    vsll.vi v12, v12, 24
-; RV32V-NEXT:    vand.vv v9, v9, v3
-; RV32V-NEXT:    vsll.vi v9, v9, 8
-; RV32V-NEXT:    vor.vv v9, v12, v9
-; RV32V-NEXT:    addi a3, sp, 96
-; RV32V-NEXT:    vlse64.v v12, (a3), zero
-; RV32V-NEXT:    vor.vv v8, v6, v8
-; RV32V-NEXT:    addi a3, sp, 88
-; RV32V-NEXT:    vlse64.v v13, (a3), zero
-; RV32V-NEXT:    vor.vv v9, v4, v9
-; RV32V-NEXT:    addi a3, sp, 80
-; RV32V-NEXT:    vlse64.v v3, (a3), zero
-; RV32V-NEXT:    vor.vv v8, v8, v2
-; RV32V-NEXT:    addi a3, sp, 72
-; RV32V-NEXT:    vlse64.v v2, (a3), zero
-; RV32V-NEXT:    vor.vv v9, v9, v5
-; RV32V-NEXT:    vsrl.vi v6, v8, 4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v8, v1
-; RV32V-NEXT:    vand.vv v6, v6, v1
-; RV32V-NEXT:    vsll.vi v8, v8, 4
-; RV32V-NEXT:    vor.vv v8, v6, v8
-; RV32V-NEXT:    vsrl.vi v6, v9, 4
-; RV32V-NEXT:    vand.vv v9, v9, v1
-; RV32V-NEXT:    vand.vv v6, v6, v1
-; RV32V-NEXT:    vsll.vi v9, v9, 4
-; RV32V-NEXT:    vor.vv v9, v6, v9
-; RV32V-NEXT:    vsrl.vi v6, v8, 2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v8, v0
-; RV32V-NEXT:    vand.vv v6, v6, v0
-; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v6, v8
-; RV32V-NEXT:    vsrl.vi v6, v9, 2
-; RV32V-NEXT:    vand.vv v9, v9, v0
-; RV32V-NEXT:    vand.vv v6, v6, v0
-; RV32V-NEXT:    vsll.vi v9, v9, 2
-; RV32V-NEXT:    vor.vv v6, v6, v9
-; RV32V-NEXT:    vsrl.vi v9, v8, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v5
-; RV32V-NEXT:    vand.vv v9, v9, v5
-; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v9, v9, v8
-; RV32V-NEXT:    vsrl.vi v8, v6, 1
-; RV32V-NEXT:    vand.vv v6, v6, v5
-; RV32V-NEXT:    vand.vv v8, v8, v5
-; RV32V-NEXT:    vadd.vv v6, v6, v6
-; RV32V-NEXT:    vor.vv v8, v8, v6
-; RV32V-NEXT:    addi a3, sp, 64
-; RV32V-NEXT:    vlse64.v v1, (a3), zero
-; RV32V-NEXT:    vand.vv v14, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v15
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v17
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v18
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v19
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v21
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v22
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v23
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v25
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v26
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v27
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v29
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v30
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v31
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v7
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v11
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v13
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v3
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi a3, sp, 56
-; RV32V-NEXT:    addi a1, sp, 48
-; RV32V-NEXT:    addi a0, sp, 40
-; RV32V-NEXT:    vlse64.v v10, (a3), zero
-; RV32V-NEXT:    vlse64.v v11, (a1), zero
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vand.vv v13, v8, v1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v11
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi a2, sp, 32
-; RV32V-NEXT:    addi a1, sp, 24
-; RV32V-NEXT:    addi a3, sp, 16
-; RV32V-NEXT:    addi a0, sp, 8
-; RV32V-NEXT:    vlse64.v v10, (a2), zero
-; RV32V-NEXT:    vlse64.v v11, (a1), zero
-; RV32V-NEXT:    vlse64.v v12, (a3), zero
-; RV32V-NEXT:    vlse64.v v13, (a0), zero
-; RV32V-NEXT:    vand.vv v10, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 2
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v11
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 1
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v13
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vand.vx v2, v8, a0
-; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vand.vx v1, v8, a0
-; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vand.vx v0, v8, a0
-; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vand.vx v13, v8, a0
-; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vand.vx v14, v8, a0
-; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vand.vx v15, v8, a0
-; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vand.vx v16, v8, a0
-; RV32V-NEXT:    vand.vx v17, v8, s11
-; RV32V-NEXT:    lui a0, 1
-; RV32V-NEXT:    vand.vx v18, v8, a0
-; RV32V-NEXT:    vand.vx v19, v8, t0
-; RV32V-NEXT:    vand.vx v20, v8, t1
-; RV32V-NEXT:    vand.vx v21, v8, t2
-; RV32V-NEXT:    vand.vx v22, v8, t3
-; RV32V-NEXT:    vand.vx v23, v8, t4
-; RV32V-NEXT:    vand.vx v24, v8, t5
-; RV32V-NEXT:    vand.vx v25, v8, t6
-; RV32V-NEXT:    vand.vx v26, v8, s0
-; RV32V-NEXT:    vand.vx v27, v8, s1
-; RV32V-NEXT:    vand.vx v28, v8, s2
-; RV32V-NEXT:    vand.vx v29, v8, s3
-; RV32V-NEXT:    vand.vx v30, v8, s4
-; RV32V-NEXT:    vand.vx v31, v8, s5
-; RV32V-NEXT:    vand.vx v7, v8, s6
-; RV32V-NEXT:    vand.vx v6, v8, s7
-; RV32V-NEXT:    vand.vx v5, v8, s8
-; RV32V-NEXT:    vand.vx v4, v8, s9
-; RV32V-NEXT:    vand.vx v3, v8, s10
-; RV32V-NEXT:    vand.vi v10, v8, 2
-; RV32V-NEXT:    vand.vi v11, v8, 1
-; RV32V-NEXT:    vand.vi v12, v8, 4
-; RV32V-NEXT:    vand.vi v8, v8, 8
-; RV32V-NEXT:    vmul.vv v10, v9, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v2, v9, v2
-; RV32V-NEXT:    vmul.vv v1, v9, v1
-; RV32V-NEXT:    vmul.vv v0, v9, v0
-; RV32V-NEXT:    vmul.vv v13, v9, v13
-; RV32V-NEXT:    vmul.vv v14, v9, v14
-; RV32V-NEXT:    vmul.vv v15, v9, v15
-; RV32V-NEXT:    vmul.vv v16, v9, v16
-; RV32V-NEXT:    vmul.vv v17, v9, v17
-; RV32V-NEXT:    vmul.vv v18, v9, v18
-; RV32V-NEXT:    vmul.vv v19, v9, v19
-; RV32V-NEXT:    vmul.vv v20, v9, v20
-; RV32V-NEXT:    vmul.vv v21, v9, v21
-; RV32V-NEXT:    vmul.vv v22, v9, v22
-; RV32V-NEXT:    vmul.vv v23, v9, v23
-; RV32V-NEXT:    vmul.vv v24, v9, v24
-; RV32V-NEXT:    vmul.vv v25, v9, v25
-; RV32V-NEXT:    vmul.vv v26, v9, v26
-; RV32V-NEXT:    vmul.vv v27, v9, v27
-; RV32V-NEXT:    vmul.vv v28, v9, v28
-; RV32V-NEXT:    vmul.vv v29, v9, v29
-; RV32V-NEXT:    vmul.vv v30, v9, v30
-; RV32V-NEXT:    vmul.vv v31, v9, v31
-; RV32V-NEXT:    vmul.vv v7, v9, v7
-; RV32V-NEXT:    vmul.vv v6, v9, v6
-; RV32V-NEXT:    vmul.vv v5, v9, v5
-; RV32V-NEXT:    vmul.vv v4, v9, v4
-; RV32V-NEXT:    vmul.vv v3, v9, v3
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v10, v9, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 2
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 2
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 1
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v9, v9, v8
-; RV32V-NEXT:    vxor.vi v11, v11, 0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v11, v11, v8
-; RV32V-NEXT:    vxor.vv v11, v11, v12
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v11, v8
-; RV32V-NEXT:    vxor.vv v8, v8, v2
-; RV32V-NEXT:    vxor.vv v8, v8, v1
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v13
-; RV32V-NEXT:    vxor.vv v8, v8, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v15
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v17
-; RV32V-NEXT:    vxor.vv v8, v8, v18
-; RV32V-NEXT:    vxor.vv v8, v8, v19
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v21
-; RV32V-NEXT:    vxor.vv v8, v8, v22
-; RV32V-NEXT:    vxor.vv v8, v8, v23
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v25
-; RV32V-NEXT:    vxor.vv v8, v8, v26
-; RV32V-NEXT:    vxor.vv v8, v8, v27
-; RV32V-NEXT:    vxor.vv v8, v8, v28
-; RV32V-NEXT:    vxor.vv v8, v8, v29
-; RV32V-NEXT:    vxor.vv v8, v8, v30
-; RV32V-NEXT:    vxor.vv v8, v8, v31
-; RV32V-NEXT:    vxor.vv v8, v8, v7
-; RV32V-NEXT:    vxor.vv v8, v8, v6
-; RV32V-NEXT:    vxor.vv v8, v8, v5
-; RV32V-NEXT:    vxor.vv v8, v8, v4
-; RV32V-NEXT:    vxor.vv v8, v8, v3
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 2
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vsrl.vx v9, v8, a6
-; RV32V-NEXT:    vsll.vx v10, v8, a6
-; RV32V-NEXT:    vsrl.vx v11, v8, ra
-; RV32V-NEXT:    vand.vx v12, v8, a4
-; RV32V-NEXT:    vand.vx v11, v11, a4
-; RV32V-NEXT:    vsrl.vi v13, v8, 24
-; RV32V-NEXT:    vand.vx v14, v8, a5
-; RV32V-NEXT:    vand.vx v13, v13, a5
-; RV32V-NEXT:    vsll.vx v12, v12, ra
-; RV32V-NEXT:    vsrl.vi v15, v8, 8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v16
-; RV32V-NEXT:    vand.vv v15, v15, v16
-; RV32V-NEXT:    vor.vv v9, v11, v9
-; RV32V-NEXT:    vor.vv v11, v15, v13
-; RV32V-NEXT:    vsll.vi v8, v8, 8
-; RV32V-NEXT:    vsll.vi v13, v14, 24
-; RV32V-NEXT:    vor.vv v8, v13, v8
-; RV32V-NEXT:    vor.vv v10, v10, v12
-; RV32V-NEXT:    vor.vv v9, v11, v9
-; RV32V-NEXT:    vor.vv v8, v10, v8
-; RV32V-NEXT:    vor.vv v8, v8, v9
-; RV32V-NEXT:    vsrl.vi v9, v8, 4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v10
-; RV32V-NEXT:    vand.vv v9, v9, v10
-; RV32V-NEXT:    vsll.vi v8, v8, 4
-; RV32V-NEXT:    vor.vv v8, v9, v8
-; RV32V-NEXT:    vsrl.vi v9, v8, 2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v10
-; RV32V-NEXT:    vand.vv v9, v9, v10
-; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v9, v8
-; RV32V-NEXT:    vsrl.vi v9, v8, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v10
-; RV32V-NEXT:    vand.vv v9, v9, v10
-; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v8, v9, v8
-; RV32V-NEXT:    vsrl.vi v8, v8, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add sp, sp, a0
-; RV32V-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    addi sp, sp, 352
-; RV32V-NEXT:    ret
-;
-; RV64V-LABEL: clmulh_nxv1i64_vv:
-; RV64V:       # %bb.0:
-; RV64V-NEXT:    addi sp, sp, -208
-; RV64V-NEXT:    sd ra, 200(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s0, 192(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s1, 184(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s2, 176(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s3, 168(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s4, 160(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s5, 152(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s6, 144(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s7, 136(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s8, 128(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s9, 120(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s10, 112(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s11, 104(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    sub sp, sp, a0
-; RV64V-NEXT:    li s6, 56
-; RV64V-NEXT:    lui t4, 16
-; RV64V-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64V-NEXT:    vsrl.vi v16, v8, 24
-; RV64V-NEXT:    vsrl.vi v10, v8, 8
-; RV64V-NEXT:    li t3, 255
-; RV64V-NEXT:    lui a0, 61681
-; RV64V-NEXT:    lui a1, 209715
-; RV64V-NEXT:    lui a5, 349525
-; RV64V-NEXT:    vsrl.vi v12, v9, 24
-; RV64V-NEXT:    vsrl.vi v11, v9, 8
-; RV64V-NEXT:    li ra, 16
-; RV64V-NEXT:    li s11, 32
-; RV64V-NEXT:    li s10, 64
-; RV64V-NEXT:    li s8, 128
-; RV64V-NEXT:    li s9, 256
-; RV64V-NEXT:    li a3, 512
-; RV64V-NEXT:    li a4, 1024
-; RV64V-NEXT:    li t0, 1
-; RV64V-NEXT:    lui s5, 1
-; RV64V-NEXT:    lui s4, 2
-; RV64V-NEXT:    lui s3, 4
-; RV64V-NEXT:    lui a7, 8
-; RV64V-NEXT:    lui t1, 32
-; RV64V-NEXT:    lui t2, 64
-; RV64V-NEXT:    lui s1, 128
-; RV64V-NEXT:    lui s2, 256
-; RV64V-NEXT:    addi t5, a0, -241
-; RV64V-NEXT:    addi t6, a1, 819
-; RV64V-NEXT:    addi s0, a5, 1365
-; RV64V-NEXT:    slli a0, t5, 32
-; RV64V-NEXT:    add t5, t5, a0
-; RV64V-NEXT:    slli a0, t6, 32
-; RV64V-NEXT:    add t6, t6, a0
-; RV64V-NEXT:    slli a0, s0, 32
-; RV64V-NEXT:    add s0, s0, a0
-; RV64V-NEXT:    addi t4, t4, -256
-; RV64V-NEXT:    lui a2, 16
-; RV64V-NEXT:    slli t3, t3, 24
-; RV64V-NEXT:    vsrl.vx v13, v9, s6
-; RV64V-NEXT:    li a0, 40
-; RV64V-NEXT:    vsrl.vx v14, v9, a0
-; RV64V-NEXT:    lui a1, 4080
-; RV64V-NEXT:    vand.vx v12, v12, a1
-; RV64V-NEXT:    vand.vx v15, v9, a1
-; RV64V-NEXT:    vsll.vx v17, v9, s6
-; RV64V-NEXT:    vand.vx v14, v14, t4
-; RV64V-NEXT:    vand.vx v11, v11, t3
-; RV64V-NEXT:    vsll.vi v15, v15, 24
-; RV64V-NEXT:    vand.vx v18, v9, t3
-; RV64V-NEXT:    vand.vx v9, v9, t4
-; RV64V-NEXT:    vor.vv v13, v14, v13
-; RV64V-NEXT:    vor.vv v11, v11, v12
-; RV64V-NEXT:    vsll.vi v12, v18, 8
-; RV64V-NEXT:    vsll.vx v9, v9, a0
-; RV64V-NEXT:    li a5, 40
-; RV64V-NEXT:    vor.vv v11, v11, v13
-; RV64V-NEXT:    vor.vv v12, v15, v12
-; RV64V-NEXT:    vor.vv v9, v17, v9
-; RV64V-NEXT:    vor.vv v9, v9, v12
-; RV64V-NEXT:    vor.vv v9, v9, v11
-; RV64V-NEXT:    vsrl.vi v11, v9, 4
-; RV64V-NEXT:    vand.vx v9, v9, t5
-; RV64V-NEXT:    vand.vx v11, v11, t5
-; RV64V-NEXT:    vsll.vi v9, v9, 4
-; RV64V-NEXT:    vor.vv v9, v11, v9
-; RV64V-NEXT:    vsrl.vi v11, v9, 2
-; RV64V-NEXT:    vand.vx v9, v9, t6
-; RV64V-NEXT:    vand.vx v11, v11, t6
-; RV64V-NEXT:    vsll.vi v9, v9, 2
-; RV64V-NEXT:    vor.vv v9, v11, v9
-; RV64V-NEXT:    vsrl.vi v11, v9, 1
-; RV64V-NEXT:    vand.vx v9, v9, s0
-; RV64V-NEXT:    vand.vx v11, v11, s0
-; RV64V-NEXT:    vadd.vv v9, v9, v9
-; RV64V-NEXT:    vor.vv v11, v11, v9
-; RV64V-NEXT:    vand.vx v14, v11, ra
-; RV64V-NEXT:    lui a0, 4096
-; RV64V-NEXT:    vand.vx v15, v11, s11
-; RV64V-NEXT:    lui a1, 8192
-; RV64V-NEXT:    vand.vx v17, v11, s10
-; RV64V-NEXT:    lui a6, 16384
-; RV64V-NEXT:    vand.vx v18, v11, s8
-; RV64V-NEXT:    lui s6, 32768
-; RV64V-NEXT:    vand.vx v19, v11, s9
-; RV64V-NEXT:    lui s7, 65536
-; RV64V-NEXT:    vand.vx v20, v11, a3
-; RV64V-NEXT:    lui s8, 131072
-; RV64V-NEXT:    vand.vx v21, v11, a4
-; RV64V-NEXT:    slli a3, t0, 11
-; RV64V-NEXT:    vand.vx v22, v11, a3
-; RV64V-NEXT:    lui s10, 262144
-; RV64V-NEXT:    li a3, 56
-; RV64V-NEXT:    vsrl.vx v4, v8, a3
-; RV64V-NEXT:    vsrl.vx v0, v8, a5
-; RV64V-NEXT:    li a5, 40
-; RV64V-NEXT:    lui a4, 4080
-; RV64V-NEXT:    vand.vx v1, v16, a4
-; RV64V-NEXT:    vand.vx v3, v8, a4
-; RV64V-NEXT:    vsll.vx v2, v8, a3
-; RV64V-NEXT:    vand.vx v23, v11, s5
-; RV64V-NEXT:    slli s11, t0, 31
-; RV64V-NEXT:    vand.vx v24, v11, s4
-; RV64V-NEXT:    slli ra, t0, 32
-; RV64V-NEXT:    vand.vx v25, v11, s3
-; RV64V-NEXT:    slli s9, t0, 33
-; RV64V-NEXT:    vand.vx v26, v11, a7
-; RV64V-NEXT:    slli a3, t0, 34
-; RV64V-NEXT:    sd a3, 80(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v27, v11, a2
-; RV64V-NEXT:    slli a2, t0, 35
-; RV64V-NEXT:    sd a2, 72(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v28, v11, t1
-; RV64V-NEXT:    slli a2, t0, 36
-; RV64V-NEXT:    sd a2, 64(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v29, v11, t2
-; RV64V-NEXT:    slli a2, t0, 37
-; RV64V-NEXT:    sd a2, 56(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v30, v11, s1
-; RV64V-NEXT:    slli a2, t0, 38
-; RV64V-NEXT:    sd a2, 48(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v31, v11, s2
-; RV64V-NEXT:    slli a2, t0, 39
-; RV64V-NEXT:    sd a2, 40(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    lui a2, 512
-; RV64V-NEXT:    vand.vx v7, v11, a2
-; RV64V-NEXT:    slli a2, t0, 40
-; RV64V-NEXT:    sd a2, 32(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    lui a2, 1024
-; RV64V-NEXT:    vand.vx v6, v11, a2
-; RV64V-NEXT:    slli a2, t0, 41
-; RV64V-NEXT:    sd a2, 24(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    lui a2, 2048
-; RV64V-NEXT:    vand.vx v9, v11, a2
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vs1r.v v9, (a2) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli s2, t0, 42
-; RV64V-NEXT:    vand.vx v9, v11, a0
-; RV64V-NEXT:    sd s2, 8(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a2, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a2, a2, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a2
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli s3, t0, 43
-; RV64V-NEXT:    vand.vx v9, v11, a1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli s4, t0, 44
-; RV64V-NEXT:    vand.vx v9, v11, a6
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli s5, t0, 45
-; RV64V-NEXT:    vand.vx v9, v11, s6
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli s6, t0, 46
-; RV64V-NEXT:    vand.vx v9, v0, t4
-; RV64V-NEXT:    vor.vv v9, v9, v4
-; RV64V-NEXT:    vand.vx v12, v11, s7
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli s7, t0, 47
-; RV64V-NEXT:    vand.vx v10, v10, t3
-; RV64V-NEXT:    vor.vv v10, v10, v1
-; RV64V-NEXT:    vand.vx v12, v11, s8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli s8, t0, 48
-; RV64V-NEXT:    vsll.vi v12, v3, 24
-; RV64V-NEXT:    vor.vv v9, v10, v9
-; RV64V-NEXT:    vand.vx v10, v8, t3
-; RV64V-NEXT:    vsll.vi v10, v10, 8
-; RV64V-NEXT:    vor.vv v10, v12, v10
-; RV64V-NEXT:    vand.vx v12, v11, s10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli s10, t0, 49
-; RV64V-NEXT:    vand.vx v8, v8, t4
-; RV64V-NEXT:    vsll.vx v8, v8, a5
-; RV64V-NEXT:    vor.vv v8, v2, v8
-; RV64V-NEXT:    vand.vx v12, v11, s11
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli s11, t0, 50
-; RV64V-NEXT:    vor.vv v8, v8, v10
-; RV64V-NEXT:    vand.vx v10, v11, ra
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli ra, t0, 51
-; RV64V-NEXT:    vor.vv v8, v8, v9
-; RV64V-NEXT:    vsrl.vi v9, v8, 4
-; RV64V-NEXT:    vand.vx v8, v8, t5
-; RV64V-NEXT:    vand.vx v9, v9, t5
-; RV64V-NEXT:    vsll.vi v8, v8, 4
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    vsrl.vi v9, v8, 2
-; RV64V-NEXT:    vand.vx v8, v8, t6
-; RV64V-NEXT:    vand.vx v9, v9, t6
-; RV64V-NEXT:    vsll.vi v8, v8, 2
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    vsrl.vi v9, v8, 1
-; RV64V-NEXT:    vand.vx v8, v8, s0
-; RV64V-NEXT:    vand.vx v9, v9, s0
-; RV64V-NEXT:    vadd.vv v8, v8, v8
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    vand.vx v9, v11, s9
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 5
-; RV64V-NEXT:    add a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli t2, t0, 52
-; RV64V-NEXT:    slli t1, t0, 53
-; RV64V-NEXT:    slli a7, t0, 54
-; RV64V-NEXT:    slli s9, t0, 55
-; RV64V-NEXT:    slli a6, t0, 56
-; RV64V-NEXT:    slli a5, t0, 57
-; RV64V-NEXT:    slli a4, t0, 58
-; RV64V-NEXT:    slli a2, t0, 59
-; RV64V-NEXT:    slli a1, t0, 60
-; RV64V-NEXT:    slli a3, t0, 61
-; RV64V-NEXT:    slli t0, t0, 62
-; RV64V-NEXT:    li a0, -1
-; RV64V-NEXT:    slli a0, a0, 63
-; RV64V-NEXT:    ld s1, 80(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v9, v11, s1
-; RV64V-NEXT:    csrr s1, vlenb
-; RV64V-NEXT:    slli s1, s1, 5
-; RV64V-NEXT:    add s1, sp, s1
-; RV64V-NEXT:    addi s1, s1, 96
-; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    ld s1, 72(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v9, v11, s1
-; RV64V-NEXT:    csrr s1, vlenb
-; RV64V-NEXT:    slli s2, s1, 5
-; RV64V-NEXT:    sub s1, s2, s1
-; RV64V-NEXT:    add s1, sp, s1
-; RV64V-NEXT:    addi s1, s1, 96
-; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    ld s1, 64(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v9, v11, s1
-; RV64V-NEXT:    csrr s1, vlenb
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    mv s2, s1
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    add s2, s2, s1
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    add s2, s2, s1
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    add s1, s1, s2
-; RV64V-NEXT:    add s1, sp, s1
-; RV64V-NEXT:    addi s1, s1, 96
-; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    ld s1, 56(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v9, v11, s1
-; RV64V-NEXT:    csrr s1, vlenb
-; RV64V-NEXT:    mv s2, s1
-; RV64V-NEXT:    slli s1, s1, 2
-; RV64V-NEXT:    add s2, s2, s1
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    add s2, s2, s1
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    add s1, s1, s2
-; RV64V-NEXT:    add s1, sp, s1
-; RV64V-NEXT:    addi s1, s1, 96
-; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    ld s1, 48(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v9, v11, s1
-; RV64V-NEXT:    csrr s1, vlenb
-; RV64V-NEXT:    slli s1, s1, 2
-; RV64V-NEXT:    mv s2, s1
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    add s2, s2, s1
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    add s1, s1, s2
-; RV64V-NEXT:    add s1, sp, s1
-; RV64V-NEXT:    addi s1, s1, 96
-; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v9, v11, s1
-; RV64V-NEXT:    csrr s1, vlenb
-; RV64V-NEXT:    mv s2, s1
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    add s2, s2, s1
-; RV64V-NEXT:    slli s1, s1, 2
-; RV64V-NEXT:    add s2, s2, s1
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    add s1, s1, s2
-; RV64V-NEXT:    add s1, sp, s1
-; RV64V-NEXT:    addi s1, s1, 96
-; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v9, v11, s1
-; RV64V-NEXT:    csrr s1, vlenb
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    mv s2, s1
-; RV64V-NEXT:    slli s1, s1, 2
-; RV64V-NEXT:    add s2, s2, s1
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    add s1, s1, s2
-; RV64V-NEXT:    add s1, sp, s1
-; RV64V-NEXT:    addi s1, s1, 96
-; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v9, v11, s1
-; RV64V-NEXT:    csrr s1, vlenb
-; RV64V-NEXT:    mv s2, s1
-; RV64V-NEXT:    slli s1, s1, 3
-; RV64V-NEXT:    add s2, s2, s1
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    add s1, s1, s2
-; RV64V-NEXT:    ld s2, 8(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    add s1, sp, s1
-; RV64V-NEXT:    addi s1, s1, 96
-; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, s2
-; RV64V-NEXT:    csrr s1, vlenb
-; RV64V-NEXT:    slli s1, s1, 3
-; RV64V-NEXT:    mv s2, s1
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    add s1, s1, s2
-; RV64V-NEXT:    add s1, sp, s1
-; RV64V-NEXT:    addi s1, s1, 96
-; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, s3
-; RV64V-NEXT:    csrr s1, vlenb
-; RV64V-NEXT:    mv s2, s1
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    add s2, s2, s1
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    add s2, s2, s1
-; RV64V-NEXT:    slli s1, s1, 2
-; RV64V-NEXT:    add s1, s1, s2
-; RV64V-NEXT:    add s1, sp, s1
-; RV64V-NEXT:    addi s1, s1, 96
-; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, s4
-; RV64V-NEXT:    csrr s1, vlenb
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    mv s2, s1
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    add s2, s2, s1
-; RV64V-NEXT:    slli s1, s1, 2
-; RV64V-NEXT:    add s1, s1, s2
-; RV64V-NEXT:    add s1, sp, s1
-; RV64V-NEXT:    addi s1, s1, 96
-; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, s5
-; RV64V-NEXT:    csrr s1, vlenb
-; RV64V-NEXT:    mv s2, s1
-; RV64V-NEXT:    slli s1, s1, 2
-; RV64V-NEXT:    add s2, s2, s1
-; RV64V-NEXT:    slli s1, s1, 2
-; RV64V-NEXT:    add s1, s1, s2
-; RV64V-NEXT:    add s1, sp, s1
-; RV64V-NEXT:    addi s1, s1, 96
-; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, s6
-; RV64V-NEXT:    csrr s1, vlenb
-; RV64V-NEXT:    slli s1, s1, 2
-; RV64V-NEXT:    mv s2, s1
-; RV64V-NEXT:    slli s1, s1, 2
-; RV64V-NEXT:    add s1, s1, s2
-; RV64V-NEXT:    add s1, sp, s1
-; RV64V-NEXT:    addi s1, s1, 96
-; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, s7
-; RV64V-NEXT:    csrr s1, vlenb
-; RV64V-NEXT:    mv s2, s1
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    add s2, s2, s1
-; RV64V-NEXT:    slli s1, s1, 3
-; RV64V-NEXT:    add s1, s1, s2
-; RV64V-NEXT:    add s1, sp, s1
-; RV64V-NEXT:    addi s1, s1, 96
-; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, s8
-; RV64V-NEXT:    csrr s1, vlenb
-; RV64V-NEXT:    slli s1, s1, 1
-; RV64V-NEXT:    mv s2, s1
-; RV64V-NEXT:    slli s1, s1, 3
-; RV64V-NEXT:    add s1, s1, s2
-; RV64V-NEXT:    add s1, sp, s1
-; RV64V-NEXT:    addi s1, s1, 96
-; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, s10
-; RV64V-NEXT:    csrr s1, vlenb
-; RV64V-NEXT:    slli s2, s1, 4
-; RV64V-NEXT:    add s1, s2, s1
-; RV64V-NEXT:    add s1, sp, s1
-; RV64V-NEXT:    addi s1, s1, 96
-; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, s11
-; RV64V-NEXT:    csrr s1, vlenb
-; RV64V-NEXT:    slli s1, s1, 4
-; RV64V-NEXT:    add s1, sp, s1
-; RV64V-NEXT:    addi s1, s1, 96
-; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, ra
-; RV64V-NEXT:    csrr s1, vlenb
-; RV64V-NEXT:    slli s2, s1, 4
-; RV64V-NEXT:    sub s1, s2, s1
-; RV64V-NEXT:    add s1, sp, s1
-; RV64V-NEXT:    addi s1, s1, 96
-; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, t2
-; RV64V-NEXT:    csrr t2, vlenb
-; RV64V-NEXT:    slli t2, t2, 1
-; RV64V-NEXT:    mv s1, t2
-; RV64V-NEXT:    slli t2, t2, 1
-; RV64V-NEXT:    add s1, s1, t2
-; RV64V-NEXT:    slli t2, t2, 1
-; RV64V-NEXT:    add t2, t2, s1
-; RV64V-NEXT:    add t2, sp, t2
-; RV64V-NEXT:    addi t2, t2, 96
-; RV64V-NEXT:    vs1r.v v9, (t2) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, t1
-; RV64V-NEXT:    csrr t1, vlenb
-; RV64V-NEXT:    mv t2, t1
-; RV64V-NEXT:    slli t1, t1, 2
-; RV64V-NEXT:    add t2, t2, t1
-; RV64V-NEXT:    slli t1, t1, 1
-; RV64V-NEXT:    add t1, t1, t2
-; RV64V-NEXT:    add t1, sp, t1
-; RV64V-NEXT:    addi t1, t1, 96
-; RV64V-NEXT:    vs1r.v v9, (t1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, a7
-; RV64V-NEXT:    csrr a7, vlenb
-; RV64V-NEXT:    slli a7, a7, 2
-; RV64V-NEXT:    mv t1, a7
-; RV64V-NEXT:    slli a7, a7, 1
-; RV64V-NEXT:    add a7, a7, t1
-; RV64V-NEXT:    add a7, sp, a7
-; RV64V-NEXT:    addi a7, a7, 96
-; RV64V-NEXT:    vs1r.v v9, (a7) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, s9
-; RV64V-NEXT:    csrr a7, vlenb
-; RV64V-NEXT:    mv t1, a7
-; RV64V-NEXT:    slli a7, a7, 1
-; RV64V-NEXT:    add t1, t1, a7
-; RV64V-NEXT:    slli a7, a7, 2
-; RV64V-NEXT:    add a7, a7, t1
-; RV64V-NEXT:    add a7, sp, a7
-; RV64V-NEXT:    addi a7, a7, 96
-; RV64V-NEXT:    vs1r.v v9, (a7) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, a6
-; RV64V-NEXT:    csrr a6, vlenb
-; RV64V-NEXT:    slli a6, a6, 1
-; RV64V-NEXT:    mv a7, a6
-; RV64V-NEXT:    slli a6, a6, 2
-; RV64V-NEXT:    add a6, a6, a7
-; RV64V-NEXT:    add a6, sp, a6
-; RV64V-NEXT:    addi a6, a6, 96
-; RV64V-NEXT:    vs1r.v v9, (a6) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, a5
-; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a6, a5, 3
-; RV64V-NEXT:    add a5, a6, a5
-; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 96
-; RV64V-NEXT:    vs1r.v v9, (a5) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, a4
-; RV64V-NEXT:    csrr a4, vlenb
-; RV64V-NEXT:    slli a4, a4, 3
-; RV64V-NEXT:    add a4, sp, a4
-; RV64V-NEXT:    addi a4, a4, 96
-; RV64V-NEXT:    vs1r.v v9, (a4) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, a2
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a4, a2, 3
-; RV64V-NEXT:    sub a2, a4, a2
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vs1r.v v9, (a2) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v9, v11, a1
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    mv a2, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a1, a1, a2
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 96
-; RV64V-NEXT:    vs1r.v v9, (a1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vi v9, v11, 2
-; RV64V-NEXT:    vand.vi v10, v11, 1
-; RV64V-NEXT:    vand.vi v12, v11, 4
-; RV64V-NEXT:    vand.vi v13, v11, 8
-; RV64V-NEXT:    vand.vx v5, v11, a3
-; RV64V-NEXT:    vand.vx v4, v11, t0
-; RV64V-NEXT:    vand.vx v2, v11, a0
-; RV64V-NEXT:    vmul.vv v3, v8, v9
-; RV64V-NEXT:    vmul.vv v9, v8, v10
-; RV64V-NEXT:    vmul.vv v10, v8, v12
-; RV64V-NEXT:    vmul.vv v11, v8, v13
-; RV64V-NEXT:    vmul.vv v12, v8, v14
-; RV64V-NEXT:    vmul.vv v13, v8, v15
-; RV64V-NEXT:    vmul.vv v14, v8, v17
-; RV64V-NEXT:    vmul.vv v15, v8, v18
-; RV64V-NEXT:    vmul.vv v16, v8, v19
-; RV64V-NEXT:    vmul.vv v17, v8, v20
-; RV64V-NEXT:    vmul.vv v18, v8, v21
-; RV64V-NEXT:    vmul.vv v19, v8, v22
-; RV64V-NEXT:    vmul.vv v20, v8, v23
-; RV64V-NEXT:    vmul.vv v21, v8, v24
-; RV64V-NEXT:    vmul.vv v22, v8, v25
-; RV64V-NEXT:    vmul.vv v23, v8, v26
-; RV64V-NEXT:    vmul.vv v24, v8, v27
-; RV64V-NEXT:    vmul.vv v25, v8, v28
-; RV64V-NEXT:    vmul.vv v26, v8, v29
-; RV64V-NEXT:    vmul.vv v27, v8, v30
-; RV64V-NEXT:    vmul.vv v28, v8, v31
-; RV64V-NEXT:    vmul.vv v29, v8, v7
-; RV64V-NEXT:    vmul.vv v30, v8, v6
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v31, v8, v31
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v7, v8, v7
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v6, v8, v6
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 2
-; RV64V-NEXT:    add a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v0, v8, v0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 1
-; RV64V-NEXT:    add a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    addi a0, sp, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 5
-; RV64V-NEXT:    add a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 5
-; RV64V-NEXT:    add a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 5
-; RV64V-NEXT:    sub a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 5
-; RV64V-NEXT:    sub a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 4
-; RV64V-NEXT:    add a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 4
-; RV64V-NEXT:    add a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 4
-; RV64V-NEXT:    sub a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 3
-; RV64V-NEXT:    add a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 3
-; RV64V-NEXT:    sub a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v8, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 96
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vmul.vv v1, v8, v5
-; RV64V-NEXT:    vmul.vv v5, v8, v4
-; RV64V-NEXT:    vmul.vv v8, v8, v2
-; RV64V-NEXT:    vxor.vv v9, v9, v3
-; RV64V-NEXT:    vxor.vv v9, v9, v10
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vxor.vv v9, v9, v12
-; RV64V-NEXT:    vxor.vv v9, v9, v13
-; RV64V-NEXT:    vxor.vv v9, v9, v14
-; RV64V-NEXT:    vxor.vv v9, v9, v15
-; RV64V-NEXT:    vxor.vv v10, v9, v16
-; RV64V-NEXT:    vxor.vv v10, v10, v17
-; RV64V-NEXT:    vxor.vv v10, v10, v18
-; RV64V-NEXT:    vxor.vv v10, v10, v19
-; RV64V-NEXT:    vxor.vv v10, v10, v20
-; RV64V-NEXT:    vxor.vv v10, v10, v21
-; RV64V-NEXT:    vxor.vv v10, v10, v22
-; RV64V-NEXT:    vxor.vv v10, v10, v23
-; RV64V-NEXT:    vxor.vv v10, v10, v24
-; RV64V-NEXT:    vxor.vv v10, v10, v25
-; RV64V-NEXT:    vxor.vv v10, v10, v26
-; RV64V-NEXT:    vxor.vv v10, v10, v27
-; RV64V-NEXT:    vxor.vv v10, v10, v28
-; RV64V-NEXT:    vxor.vv v10, v10, v29
-; RV64V-NEXT:    vxor.vv v10, v10, v30
-; RV64V-NEXT:    vxor.vv v10, v10, v31
-; RV64V-NEXT:    vxor.vv v11, v10, v7
-; RV64V-NEXT:    vxor.vv v11, v11, v6
-; RV64V-NEXT:    li a0, 56
-; RV64V-NEXT:    vsll.vx v9, v9, a0
-; RV64V-NEXT:    vand.vx v10, v10, t4
-; RV64V-NEXT:    li a1, 40
-; RV64V-NEXT:    vsll.vx v10, v10, a1
-; RV64V-NEXT:    vor.vv v9, v9, v10
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a3, a2, 2
-; RV64V-NEXT:    add a2, a3, a2
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v11, v10
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vxor.vv v10, v10, v0
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a3, a2, 1
-; RV64V-NEXT:    add a2, a3, a2
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    addi a2, sp, 96
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a3, a2, 5
-; RV64V-NEXT:    add a2, a3, a2
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 5
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a3, a2, 5
-; RV64V-NEXT:    sub a2, a3, a2
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v11, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 3
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v11, v11, v12
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 3
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v11, v11, v12
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v11, v11, v12
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v11, v11, v12
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v11, v11, v12
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v11, v11, v12
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 3
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v11, v11, v12
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 3
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v11, v12
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a3, a2, 4
-; RV64V-NEXT:    add a2, a3, a2
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v13
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 4
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v13
-; RV64V-NEXT:    vsrl.vi v13, v10, 8
-; RV64V-NEXT:    vand.vx v13, v13, t3
-; RV64V-NEXT:    vsrl.vi v11, v11, 24
-; RV64V-NEXT:    lui a2, 4080
-; RV64V-NEXT:    vand.vx v11, v11, a2
-; RV64V-NEXT:    vor.vv v11, v13, v11
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 96
-; RV64V-NEXT:    vl1r.v v13, (a3) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v13
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 96
-; RV64V-NEXT:    vl1r.v v13, (a3) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v13
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 96
-; RV64V-NEXT:    vl1r.v v13, (a3) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v13
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 96
-; RV64V-NEXT:    vl1r.v v13, (a3) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v13
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 96
-; RV64V-NEXT:    vl1r.v v13, (a3) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v13
-; RV64V-NEXT:    vand.vx v10, v10, a2
-; RV64V-NEXT:    vsll.vi v10, v10, 24
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 3
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v13, v12, v13
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 3
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v14, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v13, v13, v14
-; RV64V-NEXT:    vand.vx v14, v12, t3
-; RV64V-NEXT:    vsll.vi v14, v14, 8
-; RV64V-NEXT:    vor.vv v10, v10, v14
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 3
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v14, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v13, v13, v14
-; RV64V-NEXT:    vor.vv v9, v9, v10
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 96
-; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v13, v10
-; RV64V-NEXT:    vsrl.vx v12, v12, a1
-; RV64V-NEXT:    vand.vx v12, v12, t4
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    mv a2, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a1, a1, a2
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 96
-; RV64V-NEXT:    vl1r.v v13, (a1) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v13
-; RV64V-NEXT:    vxor.vv v10, v10, v1
-; RV64V-NEXT:    vxor.vv v10, v10, v5
-; RV64V-NEXT:    vxor.vv v8, v10, v8
-; RV64V-NEXT:    vsrl.vx v8, v8, a0
-; RV64V-NEXT:    vor.vv v8, v12, v8
-; RV64V-NEXT:    vor.vv v8, v11, v8
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    vsrl.vi v9, v8, 4
-; RV64V-NEXT:    vand.vx v8, v8, t5
-; RV64V-NEXT:    vand.vx v9, v9, t5
-; RV64V-NEXT:    vsll.vi v8, v8, 4
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    vsrl.vi v9, v8, 2
-; RV64V-NEXT:    vand.vx v8, v8, t6
-; RV64V-NEXT:    vand.vx v9, v9, t6
-; RV64V-NEXT:    vsll.vi v8, v8, 2
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    vsrl.vi v9, v8, 1
-; RV64V-NEXT:    vand.vx v8, v8, s0
-; RV64V-NEXT:    vand.vx v9, v9, s0
-; RV64V-NEXT:    vadd.vv v8, v8, v8
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    vsrl.vi v8, v8, 1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add sp, sp, a0
-; RV64V-NEXT:    ld ra, 200(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s0, 192(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s1, 184(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s2, 176(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s3, 168(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s4, 160(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s5, 152(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s6, 144(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s7, 136(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s8, 128(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s9, 120(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s10, 112(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s11, 104(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    addi sp, sp, 208
-; RV64V-NEXT:    ret
-;
-; RV32ZVBC-LABEL: clmulh_nxv1i64_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v9
-; RV32ZVBC-NEXT:    ret
-;
-; RV64ZVBC-LABEL: clmulh_nxv1i64_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64ZVBC-NEXT:    vclmulh.vv v8, v8, v9
-; RV64ZVBC-NEXT:    ret
-  %va.ext = zext <vscale x 1 x i64> %va to <vscale x 1 x i128>
-  %vb.ext = zext <vscale x 1 x i64> %vb to <vscale x 1 x i128>
-  %clmul = call <vscale x 1 x i128> @llvm.clmul.nxv1i128(<vscale x 1 x i128> %va.ext, <vscale x 1 x i128> %vb.ext)
-  %res.ext = lshr <vscale x 1 x i128> %clmul, splat(i128 64)
-  %res = trunc <vscale x 1 x i128> %res.ext to <vscale x 1 x i64>
-  ret <vscale x 1 x i64> %res
-}
-
-define <vscale x 1 x i64> @clmulh_nxv1i64_vx(<vscale x 1 x i64> %va, i64 %b) nounwind {
-; RV32V-LABEL: clmulh_nxv1i64_vx:
-; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -368
-; RV32V-NEXT:    sw ra, 364(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s0, 360(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 356(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 352(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 348(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 344(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 340(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 336(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 332(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s8, 328(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s9, 324(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s10, 320(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s11, 316(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    mv a3, a2
-; RV32V-NEXT:    slli a2, a2, 1
-; RV32V-NEXT:    add a3, a3, a2
-; RV32V-NEXT:    slli a2, a2, 1
-; RV32V-NEXT:    add a3, a3, a2
-; RV32V-NEXT:    slli a2, a2, 3
-; RV32V-NEXT:    add a2, a2, a3
-; RV32V-NEXT:    sub sp, sp, a2
-; RV32V-NEXT:    sw a0, 16(sp)
-; RV32V-NEXT:    sw a1, 20(sp)
-; RV32V-NEXT:    addi s6, sp, 16
-; RV32V-NEXT:    lui s7, 1044480
-; RV32V-NEXT:    li s11, 1
-; RV32V-NEXT:    li s8, 2
-; RV32V-NEXT:    li s2, 4
-; RV32V-NEXT:    li s10, 8
-; RV32V-NEXT:    li s5, 32
-; RV32V-NEXT:    li s4, 64
-; RV32V-NEXT:    li s3, 128
-; RV32V-NEXT:    li ra, 256
-; RV32V-NEXT:    li s1, 512
-; RV32V-NEXT:    li s0, 1024
-; RV32V-NEXT:    lui t6, 1
-; RV32V-NEXT:    lui t5, 2
-; RV32V-NEXT:    lui t4, 4
-; RV32V-NEXT:    lui t3, 8
-; RV32V-NEXT:    lui t2, 16
-; RV32V-NEXT:    lui t1, 32
-; RV32V-NEXT:    lui t0, 64
-; RV32V-NEXT:    lui a7, 128
-; RV32V-NEXT:    lui a6, 256
-; RV32V-NEXT:    lui a5, 512
-; RV32V-NEXT:    lui a4, 1024
-; RV32V-NEXT:    lui a3, 2048
-; RV32V-NEXT:    lui a2, 4096
-; RV32V-NEXT:    lui a1, 8192
-; RV32V-NEXT:    vsetvli s9, zero, e64, m1, ta, ma
-; RV32V-NEXT:    vlse64.v v13, (s6), zero
-; RV32V-NEXT:    lui s6, 16384
-; RV32V-NEXT:    sw s7, 288(sp)
-; RV32V-NEXT:    lui s7, 32768
-; RV32V-NEXT:    sw zero, 292(sp)
-; RV32V-NEXT:    lui a0, 524288
-; RV32V-NEXT:    sw a0, 280(sp)
-; RV32V-NEXT:    sw zero, 284(sp)
-; RV32V-NEXT:    sw zero, 272(sp)
-; RV32V-NEXT:    sw s11, 276(sp)
-; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw s8, 268(sp)
-; RV32V-NEXT:    lui s8, 65536
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw s2, 260(sp)
-; RV32V-NEXT:    lui s9, 131072
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw s10, 252(sp)
-; RV32V-NEXT:    lui s10, 262144
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    li s2, 16
-; RV32V-NEXT:    sw s2, 244(sp)
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw s5, 236(sp)
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw s4, 228(sp)
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s3, 220(sp)
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw ra, 212(sp)
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s1, 204(sp)
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s0, 196(sp)
-; RV32V-NEXT:    slli s11, s11, 11
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s11, 188(sp)
-; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw t6, 180(sp)
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw t5, 172(sp)
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw t4, 164(sp)
-; RV32V-NEXT:    lui s2, 4
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw t3, 156(sp)
-; RV32V-NEXT:    lui s4, 8
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw t2, 148(sp)
-; RV32V-NEXT:    lui s3, 16
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw t1, 140(sp)
-; RV32V-NEXT:    lui t2, 32
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw t0, 132(sp)
-; RV32V-NEXT:    lui t3, 64
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw a7, 124(sp)
-; RV32V-NEXT:    lui t4, 128
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw a6, 116(sp)
-; RV32V-NEXT:    lui t5, 256
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw a5, 108(sp)
-; RV32V-NEXT:    lui t6, 512
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw a4, 100(sp)
-; RV32V-NEXT:    lui s0, 1024
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw a3, 92(sp)
-; RV32V-NEXT:    lui a7, 2048
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw a2, 84(sp)
-; RV32V-NEXT:    lui s1, 4096
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a1, 76(sp)
-; RV32V-NEXT:    lui t0, 8192
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw s6, 68(sp)
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw s7, 60(sp)
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw s8, 52(sp)
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw s9, 44(sp)
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw s10, 36(sp)
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    sw a0, 28(sp)
-; RV32V-NEXT:    lui a0, 61681
-; RV32V-NEXT:    addi a0, a0, -241
-; RV32V-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32V-NEXT:    vmv.v.x v1, a0
-; RV32V-NEXT:    lui a0, 209715
-; RV32V-NEXT:    addi a0, a0, 819
-; RV32V-NEXT:    vmv.v.x v0, a0
-; RV32V-NEXT:    lui a0, 349525
-; RV32V-NEXT:    addi a0, a0, 1365
-; RV32V-NEXT:    vmv.v.x v9, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32V-NEXT:    vlse64.v v3, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 280
-; RV32V-NEXT:    vlse64.v v14, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 272
-; RV32V-NEXT:    vlse64.v v15, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 264
-; RV32V-NEXT:    vlse64.v v16, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 256
-; RV32V-NEXT:    vlse64.v v17, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 248
-; RV32V-NEXT:    vlse64.v v18, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 240
-; RV32V-NEXT:    vlse64.v v19, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 232
-; RV32V-NEXT:    vlse64.v v20, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 224
-; RV32V-NEXT:    vlse64.v v21, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 216
-; RV32V-NEXT:    vlse64.v v22, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 208
-; RV32V-NEXT:    vlse64.v v23, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 200
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 192
-; RV32V-NEXT:    vlse64.v v25, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 184
-; RV32V-NEXT:    vlse64.v v26, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 176
-; RV32V-NEXT:    vlse64.v v27, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 168
-; RV32V-NEXT:    vlse64.v v28, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 160
-; RV32V-NEXT:    vlse64.v v29, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 152
-; RV32V-NEXT:    vlse64.v v30, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 144
-; RV32V-NEXT:    vlse64.v v31, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 136
-; RV32V-NEXT:    vlse64.v v7, (a0), zero
-; RV32V-NEXT:    li a5, 56
-; RV32V-NEXT:    vsrl.vi v9, v8, 24
-; RV32V-NEXT:    vsrl.vi v10, v8, 8
-; RV32V-NEXT:    vsrl.vx v11, v8, a5
-; RV32V-NEXT:    li ra, 40
-; RV32V-NEXT:    vsrl.vx v12, v8, ra
-; RV32V-NEXT:    lui a6, 4080
-; RV32V-NEXT:    vand.vx v9, v9, a6
-; RV32V-NEXT:    vsll.vx v6, v8, a5
-; RV32V-NEXT:    addi a4, s3, -256
-; RV32V-NEXT:    vand.vx v12, v12, a4
-; RV32V-NEXT:    vand.vx v5, v8, a4
-; RV32V-NEXT:    vor.vv v11, v12, v11
-; RV32V-NEXT:    vsll.vx v12, v5, ra
-; RV32V-NEXT:    vor.vv v6, v6, v12
-; RV32V-NEXT:    vsrl.vx v12, v13, a5
-; RV32V-NEXT:    vsrl.vx v5, v13, ra
-; RV32V-NEXT:    vsll.vx v4, v13, a5
-; RV32V-NEXT:    vand.vx v5, v5, a4
-; RV32V-NEXT:    vor.vv v12, v5, v12
-; RV32V-NEXT:    vand.vx v5, v13, a4
-; RV32V-NEXT:    vsll.vx v5, v5, ra
-; RV32V-NEXT:    vor.vv v5, v4, v5
-; RV32V-NEXT:    vsrl.vi v4, v13, 24
-; RV32V-NEXT:    vand.vv v10, v10, v3
-; RV32V-NEXT:    vor.vv v9, v10, v9
-; RV32V-NEXT:    vsrl.vi v10, v13, 8
-; RV32V-NEXT:    vand.vx v4, v4, a6
-; RV32V-NEXT:    vand.vv v10, v10, v3
-; RV32V-NEXT:    vor.vv v4, v10, v4
-; RV32V-NEXT:    addi a3, sp, 128
-; RV32V-NEXT:    vlse64.v v10, (a3), zero
-; RV32V-NEXT:    vor.vv v9, v9, v11
-; RV32V-NEXT:    vand.vx v11, v8, a6
-; RV32V-NEXT:    vsll.vi v11, v11, 24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v8, v3
-; RV32V-NEXT:    vsll.vi v8, v8, 8
-; RV32V-NEXT:    vor.vv v8, v11, v8
-; RV32V-NEXT:    addi a3, sp, 120
-; RV32V-NEXT:    vlse64.v v11, (a3), zero
-; RV32V-NEXT:    vor.vv v4, v4, v12
-; RV32V-NEXT:    vand.vx v12, v13, a6
-; RV32V-NEXT:    vsll.vi v12, v12, 24
-; RV32V-NEXT:    vand.vv v13, v13, v3
-; RV32V-NEXT:    vsll.vi v13, v13, 8
-; RV32V-NEXT:    vor.vv v3, v12, v13
-; RV32V-NEXT:    addi a3, sp, 112
-; RV32V-NEXT:    vlse64.v v12, (a3), zero
-; RV32V-NEXT:    vor.vv v8, v6, v8
-; RV32V-NEXT:    addi a3, sp, 104
-; RV32V-NEXT:    vlse64.v v13, (a3), zero
-; RV32V-NEXT:    vor.vv v6, v5, v3
-; RV32V-NEXT:    addi a3, sp, 96
-; RV32V-NEXT:    vlse64.v v3, (a3), zero
-; RV32V-NEXT:    vor.vv v8, v8, v9
-; RV32V-NEXT:    addi a3, sp, 88
-; RV32V-NEXT:    vlse64.v v2, (a3), zero
-; RV32V-NEXT:    vor.vv v9, v6, v4
-; RV32V-NEXT:    vsrl.vi v6, v8, 4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v8, v1
-; RV32V-NEXT:    vand.vv v6, v6, v1
-; RV32V-NEXT:    vsll.vi v8, v8, 4
-; RV32V-NEXT:    vor.vv v8, v6, v8
-; RV32V-NEXT:    vsrl.vi v6, v9, 4
-; RV32V-NEXT:    vand.vv v9, v9, v1
-; RV32V-NEXT:    vand.vv v6, v6, v1
-; RV32V-NEXT:    vsll.vi v9, v9, 4
-; RV32V-NEXT:    vor.vv v9, v6, v9
-; RV32V-NEXT:    vsrl.vi v6, v8, 2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v8, v0
-; RV32V-NEXT:    vand.vv v6, v6, v0
-; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v6, v8
-; RV32V-NEXT:    vsrl.vi v6, v9, 2
-; RV32V-NEXT:    vand.vv v9, v9, v0
-; RV32V-NEXT:    vand.vv v6, v6, v0
-; RV32V-NEXT:    vsll.vi v9, v9, 2
-; RV32V-NEXT:    vor.vv v6, v6, v9
-; RV32V-NEXT:    vsrl.vi v9, v8, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v5
-; RV32V-NEXT:    vand.vv v9, v9, v5
-; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v9, v9, v8
-; RV32V-NEXT:    vsrl.vi v8, v6, 1
-; RV32V-NEXT:    vand.vv v6, v6, v5
-; RV32V-NEXT:    vand.vv v8, v8, v5
-; RV32V-NEXT:    vadd.vv v6, v6, v6
-; RV32V-NEXT:    vor.vv v8, v8, v6
-; RV32V-NEXT:    addi a3, sp, 80
-; RV32V-NEXT:    vlse64.v v1, (a3), zero
-; RV32V-NEXT:    vand.vv v14, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v15
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v17
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v18
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v19
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v21
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v22
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v23
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v25
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v26
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v27
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v29
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v30
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v31
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v14, v8, v7
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v11
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v13
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v3
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi a3, sp, 72
-; RV32V-NEXT:    addi a1, sp, 64
-; RV32V-NEXT:    addi a0, sp, 56
-; RV32V-NEXT:    vlse64.v v10, (a3), zero
-; RV32V-NEXT:    vlse64.v v11, (a1), zero
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vand.vv v13, v8, v1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v11
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi a2, sp, 48
-; RV32V-NEXT:    addi a1, sp, 40
-; RV32V-NEXT:    addi a3, sp, 32
-; RV32V-NEXT:    addi a0, sp, 24
-; RV32V-NEXT:    vlse64.v v10, (a2), zero
-; RV32V-NEXT:    vlse64.v v11, (a1), zero
-; RV32V-NEXT:    vlse64.v v12, (a3), zero
-; RV32V-NEXT:    vlse64.v v13, (a0), zero
-; RV32V-NEXT:    vand.vv v10, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 2
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v11
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 1
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v13
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vand.vx v2, v8, a0
-; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vand.vx v1, v8, a0
-; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vand.vx v0, v8, a0
-; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vand.vx v13, v8, a0
-; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vand.vx v14, v8, a0
-; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vand.vx v15, v8, a0
-; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vand.vx v16, v8, a0
-; RV32V-NEXT:    vand.vx v17, v8, s11
-; RV32V-NEXT:    lui a0, 1
-; RV32V-NEXT:    vand.vx v18, v8, a0
-; RV32V-NEXT:    lui a0, 2
-; RV32V-NEXT:    vand.vx v19, v8, a0
-; RV32V-NEXT:    vand.vx v20, v8, s2
-; RV32V-NEXT:    vand.vx v21, v8, s4
-; RV32V-NEXT:    vand.vx v22, v8, s3
-; RV32V-NEXT:    vand.vx v23, v8, t2
-; RV32V-NEXT:    vand.vx v24, v8, t3
-; RV32V-NEXT:    vand.vx v25, v8, t4
-; RV32V-NEXT:    vand.vx v26, v8, t5
-; RV32V-NEXT:    vand.vx v27, v8, t6
-; RV32V-NEXT:    vand.vx v28, v8, s0
-; RV32V-NEXT:    vand.vx v29, v8, a7
-; RV32V-NEXT:    vand.vx v30, v8, s1
-; RV32V-NEXT:    vand.vx v31, v8, t0
-; RV32V-NEXT:    vand.vx v7, v8, s6
-; RV32V-NEXT:    vand.vx v6, v8, s7
-; RV32V-NEXT:    vand.vx v5, v8, s8
-; RV32V-NEXT:    vand.vx v4, v8, s9
-; RV32V-NEXT:    vand.vx v3, v8, s10
-; RV32V-NEXT:    vand.vi v10, v8, 2
-; RV32V-NEXT:    vand.vi v11, v8, 1
-; RV32V-NEXT:    vand.vi v12, v8, 4
-; RV32V-NEXT:    vand.vi v8, v8, 8
-; RV32V-NEXT:    vmul.vv v10, v9, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    addi a0, sp, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v2, v9, v2
-; RV32V-NEXT:    vmul.vv v1, v9, v1
-; RV32V-NEXT:    vmul.vv v0, v9, v0
-; RV32V-NEXT:    vmul.vv v13, v9, v13
-; RV32V-NEXT:    vmul.vv v14, v9, v14
-; RV32V-NEXT:    vmul.vv v15, v9, v15
-; RV32V-NEXT:    vmul.vv v16, v9, v16
-; RV32V-NEXT:    vmul.vv v17, v9, v17
-; RV32V-NEXT:    vmul.vv v18, v9, v18
-; RV32V-NEXT:    vmul.vv v19, v9, v19
-; RV32V-NEXT:    vmul.vv v20, v9, v20
-; RV32V-NEXT:    vmul.vv v21, v9, v21
-; RV32V-NEXT:    vmul.vv v22, v9, v22
-; RV32V-NEXT:    vmul.vv v23, v9, v23
-; RV32V-NEXT:    vmul.vv v24, v9, v24
-; RV32V-NEXT:    vmul.vv v25, v9, v25
-; RV32V-NEXT:    vmul.vv v26, v9, v26
-; RV32V-NEXT:    vmul.vv v27, v9, v27
-; RV32V-NEXT:    vmul.vv v28, v9, v28
-; RV32V-NEXT:    vmul.vv v29, v9, v29
-; RV32V-NEXT:    vmul.vv v30, v9, v30
-; RV32V-NEXT:    vmul.vv v31, v9, v31
-; RV32V-NEXT:    vmul.vv v7, v9, v7
-; RV32V-NEXT:    vmul.vv v6, v9, v6
-; RV32V-NEXT:    vmul.vv v5, v9, v5
-; RV32V-NEXT:    vmul.vv v4, v9, v4
-; RV32V-NEXT:    vmul.vv v3, v9, v3
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v10, v9, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 2
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 2
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 1
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v9, v9, v8
-; RV32V-NEXT:    vxor.vi v11, v11, 0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v11, v11, v8
-; RV32V-NEXT:    vxor.vv v11, v11, v12
-; RV32V-NEXT:    addi a0, sp, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v11, v8
-; RV32V-NEXT:    vxor.vv v8, v8, v2
-; RV32V-NEXT:    vxor.vv v8, v8, v1
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v13
-; RV32V-NEXT:    vxor.vv v8, v8, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v15
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v17
-; RV32V-NEXT:    vxor.vv v8, v8, v18
-; RV32V-NEXT:    vxor.vv v8, v8, v19
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v21
-; RV32V-NEXT:    vxor.vv v8, v8, v22
-; RV32V-NEXT:    vxor.vv v8, v8, v23
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v25
-; RV32V-NEXT:    vxor.vv v8, v8, v26
-; RV32V-NEXT:    vxor.vv v8, v8, v27
-; RV32V-NEXT:    vxor.vv v8, v8, v28
-; RV32V-NEXT:    vxor.vv v8, v8, v29
-; RV32V-NEXT:    vxor.vv v8, v8, v30
-; RV32V-NEXT:    vxor.vv v8, v8, v31
-; RV32V-NEXT:    vxor.vv v8, v8, v7
-; RV32V-NEXT:    vxor.vv v8, v8, v6
-; RV32V-NEXT:    vxor.vv v8, v8, v5
-; RV32V-NEXT:    vxor.vv v8, v8, v4
-; RV32V-NEXT:    vxor.vv v8, v8, v3
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 2
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vsrl.vx v9, v8, a5
-; RV32V-NEXT:    vsll.vx v10, v8, a5
-; RV32V-NEXT:    vsrl.vx v11, v8, ra
-; RV32V-NEXT:    vand.vx v12, v8, a4
-; RV32V-NEXT:    vand.vx v11, v11, a4
-; RV32V-NEXT:    vsrl.vi v13, v8, 24
-; RV32V-NEXT:    vand.vx v14, v8, a6
-; RV32V-NEXT:    vand.vx v13, v13, a6
-; RV32V-NEXT:    vsll.vx v12, v12, ra
-; RV32V-NEXT:    vsrl.vi v15, v8, 8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v16
-; RV32V-NEXT:    vand.vv v15, v15, v16
-; RV32V-NEXT:    vor.vv v9, v11, v9
-; RV32V-NEXT:    vor.vv v11, v15, v13
-; RV32V-NEXT:    vsll.vi v8, v8, 8
-; RV32V-NEXT:    vsll.vi v13, v14, 24
-; RV32V-NEXT:    vor.vv v8, v13, v8
-; RV32V-NEXT:    vor.vv v10, v10, v12
-; RV32V-NEXT:    vor.vv v9, v11, v9
-; RV32V-NEXT:    vor.vv v8, v10, v8
-; RV32V-NEXT:    vor.vv v8, v8, v9
-; RV32V-NEXT:    vsrl.vi v9, v8, 4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v10
-; RV32V-NEXT:    vand.vv v9, v9, v10
-; RV32V-NEXT:    vsll.vi v8, v8, 4
-; RV32V-NEXT:    vor.vv v8, v9, v8
-; RV32V-NEXT:    vsrl.vi v9, v8, 2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v10
-; RV32V-NEXT:    vand.vv v9, v9, v10
-; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v9, v8
-; RV32V-NEXT:    vsrl.vi v9, v8, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v10
-; RV32V-NEXT:    vand.vv v9, v9, v10
-; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v8, v9, v8
-; RV32V-NEXT:    vsrl.vi v8, v8, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add sp, sp, a0
-; RV32V-NEXT:    lw ra, 364(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s0, 360(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s1, 356(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s2, 352(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s3, 348(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s4, 344(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s5, 340(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s6, 336(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s7, 332(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s8, 328(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s9, 324(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s10, 320(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s11, 316(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    addi sp, sp, 368
-; RV32V-NEXT:    ret
-;
-; RV64V-LABEL: clmulh_nxv1i64_vx:
-; RV64V:       # %bb.0:
-; RV64V-NEXT:    li a1, 56
-; RV64V-NEXT:    lui t2, 16
-; RV64V-NEXT:    lui a2, 4080
-; RV64V-NEXT:    li t0, 255
-; RV64V-NEXT:    lui a3, 61681
-; RV64V-NEXT:    lui a4, 209715
-; RV64V-NEXT:    lui a5, 349525
-; RV64V-NEXT:    srli a6, a0, 24
-; RV64V-NEXT:    srli a7, a0, 8
-; RV64V-NEXT:    srli t1, a0, 40
-; RV64V-NEXT:    srli t3, a0, 56
-; RV64V-NEXT:    addi a3, a3, -241
-; RV64V-NEXT:    addi a4, a4, 819
-; RV64V-NEXT:    addi t4, a5, 1365
-; RV64V-NEXT:    slli a5, a3, 32
-; RV64V-NEXT:    add a5, a3, a5
-; RV64V-NEXT:    slli a3, a4, 32
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, t4, 32
-; RV64V-NEXT:    add a3, t4, a3
-; RV64V-NEXT:    srliw t4, a0, 24
-; RV64V-NEXT:    slli t0, t0, 24
-; RV64V-NEXT:    and a6, a6, a2
-; RV64V-NEXT:    and a7, a7, t0
-; RV64V-NEXT:    or t5, a7, a6
-; RV64V-NEXT:    addi a6, t2, -256
-; RV64V-NEXT:    and a7, t1, a6
-; RV64V-NEXT:    or t1, a7, t3
-; RV64V-NEXT:    and a7, a0, a2
-; RV64V-NEXT:    slli t4, t4, 32
-; RV64V-NEXT:    slli a7, a7, 24
-; RV64V-NEXT:    or t3, a7, t4
-; RV64V-NEXT:    li a7, 40
-; RV64V-NEXT:    vsetvli t4, zero, e64, m1, ta, ma
-; RV64V-NEXT:    vsrl.vi v10, v8, 24
-; RV64V-NEXT:    vsrl.vi v9, v8, 8
-; RV64V-NEXT:    or t1, t5, t1
-; RV64V-NEXT:    slli t4, a0, 56
-; RV64V-NEXT:    and a0, a0, a6
-; RV64V-NEXT:    slli a0, a0, 40
-; RV64V-NEXT:    or t4, t4, a0
-; RV64V-NEXT:    li a0, 1
-; RV64V-NEXT:    or t4, t4, t3
-; RV64V-NEXT:    lui t3, 1
-; RV64V-NEXT:    vsrl.vx v11, v8, a1
-; RV64V-NEXT:    vsrl.vx v12, v8, a7
-; RV64V-NEXT:    vand.vx v10, v10, a2
-; RV64V-NEXT:    vand.vx v13, v8, a2
-; RV64V-NEXT:    vsll.vx v14, v8, a1
-; RV64V-NEXT:    vand.vx v12, v12, a6
-; RV64V-NEXT:    vand.vx v9, v9, t0
-; RV64V-NEXT:    vsll.vi v13, v13, 24
-; RV64V-NEXT:    vor.vv v11, v12, v11
-; RV64V-NEXT:    vand.vx v12, v8, t0
-; RV64V-NEXT:    vand.vx v8, v8, a6
-; RV64V-NEXT:    vor.vv v9, v9, v10
-; RV64V-NEXT:    vsll.vi v10, v12, 8
-; RV64V-NEXT:    vsll.vx v8, v8, a7
-; RV64V-NEXT:    vor.vv v9, v9, v11
-; RV64V-NEXT:    vor.vv v10, v13, v10
-; RV64V-NEXT:    vor.vv v8, v14, v8
-; RV64V-NEXT:    vor.vv v8, v8, v10
-; RV64V-NEXT:    vor.vv v8, v8, v9
-; RV64V-NEXT:    or t1, t4, t1
-; RV64V-NEXT:    vsrl.vi v9, v8, 4
-; RV64V-NEXT:    vand.vx v8, v8, a5
-; RV64V-NEXT:    srli t4, t1, 4
-; RV64V-NEXT:    and t1, t1, a5
-; RV64V-NEXT:    vand.vx v9, v9, a5
-; RV64V-NEXT:    vsll.vi v8, v8, 4
-; RV64V-NEXT:    and t4, t4, a5
-; RV64V-NEXT:    slli t1, t1, 4
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    or t1, t4, t1
-; RV64V-NEXT:    vsrl.vi v9, v8, 2
-; RV64V-NEXT:    vand.vx v8, v8, a4
-; RV64V-NEXT:    srli t4, t1, 2
-; RV64V-NEXT:    and t1, t1, a4
-; RV64V-NEXT:    vand.vx v9, v9, a4
-; RV64V-NEXT:    vsll.vi v8, v8, 2
-; RV64V-NEXT:    and t4, t4, a4
-; RV64V-NEXT:    slli t1, t1, 2
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    or t1, t4, t1
-; RV64V-NEXT:    vsrl.vi v9, v8, 1
-; RV64V-NEXT:    vand.vx v8, v8, a3
-; RV64V-NEXT:    srli t4, t1, 1
-; RV64V-NEXT:    and t1, t1, a3
-; RV64V-NEXT:    vand.vx v9, v9, a3
-; RV64V-NEXT:    vadd.vv v8, v8, v8
-; RV64V-NEXT:    and t4, t4, a3
-; RV64V-NEXT:    slli t1, t1, 1
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    or t1, t4, t1
-; RV64V-NEXT:    andi t4, t1, 2
-; RV64V-NEXT:    vmul.vx v9, v8, t4
-; RV64V-NEXT:    andi t4, t1, 1
-; RV64V-NEXT:    vmul.vx v10, v8, t4
-; RV64V-NEXT:    andi t4, t1, 4
-; RV64V-NEXT:    vmul.vx v11, v8, t4
-; RV64V-NEXT:    andi t4, t1, 8
-; RV64V-NEXT:    vmul.vx v12, v8, t4
-; RV64V-NEXT:    andi t4, t1, 16
-; RV64V-NEXT:    vmul.vx v13, v8, t4
-; RV64V-NEXT:    andi t4, t1, 32
-; RV64V-NEXT:    vmul.vx v14, v8, t4
-; RV64V-NEXT:    andi t4, t1, 64
-; RV64V-NEXT:    vxor.vv v9, v10, v9
-; RV64V-NEXT:    vmul.vx v10, v8, t4
-; RV64V-NEXT:    andi t4, t1, 128
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vmul.vx v11, v8, t4
-; RV64V-NEXT:    andi t4, t1, 256
-; RV64V-NEXT:    vxor.vv v9, v9, v12
-; RV64V-NEXT:    vmul.vx v12, v8, t4
-; RV64V-NEXT:    andi t4, t1, 512
-; RV64V-NEXT:    vxor.vv v9, v9, v13
-; RV64V-NEXT:    vmul.vx v13, v8, t4
-; RV64V-NEXT:    andi t4, t1, 1024
-; RV64V-NEXT:    vxor.vv v9, v9, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t4
-; RV64V-NEXT:    slli t4, a0, 11
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v9, v9, v10
-; RV64V-NEXT:    vmul.vx v10, v8, t4
-; RV64V-NEXT:    lui t4, 2
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v9, v9, v11
-; RV64V-NEXT:    vmul.vx v11, v8, t3
-; RV64V-NEXT:    lui t3, 4
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v12, v9, v12
-; RV64V-NEXT:    vxor.vv v12, v12, v13
-; RV64V-NEXT:    vmul.vx v13, v8, t4
-; RV64V-NEXT:    lui t4, 8
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t3
-; RV64V-NEXT:    lui t3, 32
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v10, v12, v10
-; RV64V-NEXT:    vmul.vx v12, v8, t4
-; RV64V-NEXT:    lui t4, 64
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vmul.vx v11, v8, t2
-; RV64V-NEXT:    lui t2, 128
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v10, v10, v13
-; RV64V-NEXT:    vmul.vx v13, v8, t3
-; RV64V-NEXT:    lui t3, 256
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v10, v10, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t4
-; RV64V-NEXT:    lui t4, 512
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v10, v10, v12
-; RV64V-NEXT:    vmul.vx v12, v8, t2
-; RV64V-NEXT:    lui t2, 1024
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vmul.vx v11, v8, t3
-; RV64V-NEXT:    lui t3, 2048
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v10, v10, v13
-; RV64V-NEXT:    vmul.vx v13, v8, t4
-; RV64V-NEXT:    lui t4, 4096
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v10, v10, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t2
-; RV64V-NEXT:    lui t2, 8192
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v10, v10, v12
-; RV64V-NEXT:    vmul.vx v12, v8, t3
-; RV64V-NEXT:    lui t3, 16384
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vmul.vx v11, v8, t4
-; RV64V-NEXT:    lui t4, 32768
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v10, v10, v13
-; RV64V-NEXT:    vmul.vx v13, v8, t2
-; RV64V-NEXT:    lui t2, 65536
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v10, v10, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t3
-; RV64V-NEXT:    lui t3, 131072
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v10, v10, v12
-; RV64V-NEXT:    vmul.vx v12, v8, t4
-; RV64V-NEXT:    lui t4, 262144
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v11, v10, v11
-; RV64V-NEXT:    vxor.vv v11, v11, v13
-; RV64V-NEXT:    vmul.vx v13, v8, t2
-; RV64V-NEXT:    slli t2, a0, 32
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vsll.vx v9, v9, a1
-; RV64V-NEXT:    vand.vx v10, v10, a6
-; RV64V-NEXT:    vsll.vx v10, v10, a7
-; RV64V-NEXT:    vor.vv v9, v9, v10
-; RV64V-NEXT:    vmul.vx v10, v8, t3
-; RV64V-NEXT:    slli t3, a0, 33
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v11, v11, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t4
-; RV64V-NEXT:    slli t4, a0, 34
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v11, v11, v12
-; RV64V-NEXT:    vmul.vx v12, v8, t2
-; RV64V-NEXT:    slli t2, a0, 35
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v11, v11, v13
-; RV64V-NEXT:    vmul.vx v13, v8, t3
-; RV64V-NEXT:    slli t3, a0, 36
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v10, v11, v10
-; RV64V-NEXT:    vmul.vx v11, v8, t4
-; RV64V-NEXT:    srliw t4, t1, 31
-; RV64V-NEXT:    slli t4, t4, 31
-; RV64V-NEXT:    vxor.vv v10, v10, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t4
-; RV64V-NEXT:    slli t4, a0, 37
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v10, v10, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t2
-; RV64V-NEXT:    slli t2, a0, 38
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v10, v10, v12
-; RV64V-NEXT:    vmul.vx v12, v8, t3
-; RV64V-NEXT:    slli t3, a0, 39
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v10, v10, v13
-; RV64V-NEXT:    vmul.vx v13, v8, t4
-; RV64V-NEXT:    slli t4, a0, 40
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vmul.vx v11, v8, t2
-; RV64V-NEXT:    slli t2, a0, 41
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v10, v10, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t3
-; RV64V-NEXT:    slli t3, a0, 42
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v10, v10, v12
-; RV64V-NEXT:    vmul.vx v12, v8, t4
-; RV64V-NEXT:    slli t4, a0, 43
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v10, v10, v13
-; RV64V-NEXT:    vmul.vx v13, v8, t2
-; RV64V-NEXT:    slli t2, a0, 44
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    vmul.vx v11, v8, t3
-; RV64V-NEXT:    slli t3, a0, 45
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v10, v10, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t4
-; RV64V-NEXT:    slli t4, a0, 46
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v12, v10, v12
-; RV64V-NEXT:    vxor.vv v12, v12, v13
-; RV64V-NEXT:    vmul.vx v13, v8, t2
-; RV64V-NEXT:    slli t2, a0, 47
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v11, v12, v11
-; RV64V-NEXT:    vmul.vx v12, v8, t3
-; RV64V-NEXT:    slli t3, a0, 48
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v11, v11, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t4
-; RV64V-NEXT:    slli t4, a0, 49
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v11, v11, v13
-; RV64V-NEXT:    vmul.vx v13, v8, t2
-; RV64V-NEXT:    slli t2, a0, 50
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v11, v11, v12
-; RV64V-NEXT:    vmul.vx v12, v8, t3
-; RV64V-NEXT:    slli t3, a0, 51
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v11, v11, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t4
-; RV64V-NEXT:    slli t4, a0, 52
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v11, v11, v13
-; RV64V-NEXT:    vmul.vx v13, v8, t2
-; RV64V-NEXT:    slli t2, a0, 53
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v12, v11, v12
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t3
-; RV64V-NEXT:    slli t3, a0, 54
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v12, v12, v13
-; RV64V-NEXT:    vsrl.vi v13, v10, 8
-; RV64V-NEXT:    vand.vx v13, v13, t0
-; RV64V-NEXT:    vsrl.vi v11, v11, 24
-; RV64V-NEXT:    vand.vx v11, v11, a2
-; RV64V-NEXT:    vor.vv v11, v13, v11
-; RV64V-NEXT:    vmul.vx v13, v8, t4
-; RV64V-NEXT:    slli t4, a0, 55
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t2
-; RV64V-NEXT:    slli t2, a0, 56
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v12, v12, v13
-; RV64V-NEXT:    vmul.vx v13, v8, t3
-; RV64V-NEXT:    slli t3, a0, 57
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t4
-; RV64V-NEXT:    slli t4, a0, 58
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v12, v12, v13
-; RV64V-NEXT:    vmul.vx v13, v8, t2
-; RV64V-NEXT:    slli t2, a0, 59
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t3
-; RV64V-NEXT:    slli t3, a0, 60
-; RV64V-NEXT:    vand.vx v10, v10, a2
-; RV64V-NEXT:    slli a2, a0, 61
-; RV64V-NEXT:    slli a0, a0, 62
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    and a2, t1, a2
-; RV64V-NEXT:    and a0, t1, a0
-; RV64V-NEXT:    srli t1, t1, 63
-; RV64V-NEXT:    vsll.vi v10, v10, 24
-; RV64V-NEXT:    vxor.vv v13, v12, v13
-; RV64V-NEXT:    vxor.vv v13, v13, v14
-; RV64V-NEXT:    vand.vx v14, v12, t0
-; RV64V-NEXT:    vsll.vi v14, v14, 8
-; RV64V-NEXT:    vor.vv v10, v10, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t4
-; RV64V-NEXT:    vxor.vv v13, v13, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t2
-; RV64V-NEXT:    vor.vv v9, v9, v10
-; RV64V-NEXT:    vmul.vx v10, v8, t3
-; RV64V-NEXT:    vxor.vv v13, v13, v14
-; RV64V-NEXT:    vmul.vx v14, v8, a2
-; RV64V-NEXT:    vxor.vv v10, v13, v10
-; RV64V-NEXT:    vmul.vx v13, v8, a0
-; RV64V-NEXT:    slli t1, t1, 63
-; RV64V-NEXT:    vmul.vx v8, v8, t1
-; RV64V-NEXT:    vsrl.vx v12, v12, a7
-; RV64V-NEXT:    vand.vx v12, v12, a6
-; RV64V-NEXT:    vxor.vv v10, v10, v14
-; RV64V-NEXT:    vxor.vv v10, v10, v13
-; RV64V-NEXT:    vxor.vv v8, v10, v8
-; RV64V-NEXT:    vsrl.vx v8, v8, a1
-; RV64V-NEXT:    vor.vv v8, v12, v8
-; RV64V-NEXT:    vor.vv v8, v11, v8
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    vsrl.vi v9, v8, 4
-; RV64V-NEXT:    vand.vx v8, v8, a5
-; RV64V-NEXT:    vand.vx v9, v9, a5
-; RV64V-NEXT:    vsll.vi v8, v8, 4
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    vsrl.vi v9, v8, 2
-; RV64V-NEXT:    vand.vx v8, v8, a4
-; RV64V-NEXT:    vand.vx v9, v9, a4
-; RV64V-NEXT:    vsll.vi v8, v8, 2
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    vsrl.vi v9, v8, 1
-; RV64V-NEXT:    vand.vx v8, v8, a3
-; RV64V-NEXT:    vand.vx v9, v9, a3
-; RV64V-NEXT:    vadd.vv v8, v8, v8
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    vsrl.vi v8, v8, 1
-; RV64V-NEXT:    ret
-;
-; RV32ZVBC-LABEL: clmulh_nxv1i64_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    addi sp, sp, -16
-; RV32ZVBC-NEXT:    sw a0, 8(sp)
-; RV32ZVBC-NEXT:    sw a1, 12(sp)
-; RV32ZVBC-NEXT:    addi a0, sp, 8
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32ZVBC-NEXT:    vlse64.v v9, (a0), zero
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v9
-; RV32ZVBC-NEXT:    addi sp, sp, 16
-; RV32ZVBC-NEXT:    ret
-;
-; RV64ZVBC-LABEL: clmulh_nxv1i64_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV64ZVBC-NEXT:    vclmulh.vx v8, v8, a0
-; RV64ZVBC-NEXT:    ret
-  %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i128 0
-  %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-  %va.ext = zext <vscale x 1 x i64> %va to <vscale x 1 x i128>
-  %vb.ext = zext <vscale x 1 x i64> %vb to <vscale x 1 x i128>
-  %clmul = call <vscale x 1 x i128> @llvm.clmul.nxv1i128(<vscale x 1 x i128> %va.ext, <vscale x 1 x i128> %vb.ext)
-  %res.ext = lshr <vscale x 1 x i128> %clmul, splat(i128 64)
-  %res = trunc <vscale x 1 x i128> %res.ext to <vscale x 1 x i64>
-  ret <vscale x 1 x i64> %res
-}
-
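(For readers skimming the diff: the IR pattern in the test above — zext both operands to i128, call llvm.clmul, lshr by 64, trunc back to i64 — is the high half of the 64x64 carry-less product, which is what the Zvbc vclmulh.vv/vclmulh.vx checks compute per element. A minimal scalar sketch in C of that operation, under the assumption of 64-bit lanes; clmulh64 is a hypothetical helper name for illustration, not part of this patch:

    #include <stdint.h>

    // High 64 bits of the 128-bit carry-less product of a and b.
    // Bit i of b contributes (a << i) to the full product; only i >= 1
    // reaches the high half, where it shows up as a >> (64 - i).
    // XOR accumulates the partial products (no carries).
    uint64_t clmulh64(uint64_t a, uint64_t b) {
      uint64_t hi = 0;
      for (int i = 1; i < 64; ++i)
        if ((b >> i) & 1)
          hi ^= a >> (64 - i);
      return hi;
    }

This mirrors lshr(clmul(zext(a), zext(b)), 64) truncated to i64, i.e. the %res computed by the test body above.)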
-define <vscale x 2 x i64> @clmulh_nxv2i64_vv(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb) nounwind {
-; RV32V-LABEL: clmulh_nxv2i64_vv:
-; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -352
-; RV32V-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    sub sp, sp, a0
-; RV32V-NEXT:    lui a0, 1044480
-; RV32V-NEXT:    lui s11, 524288
-; RV32V-NEXT:    li s4, 1
-; RV32V-NEXT:    li ra, 2
-; RV32V-NEXT:    li s10, 4
-; RV32V-NEXT:    li s7, 8
-; RV32V-NEXT:    li s9, 16
-; RV32V-NEXT:    li s8, 32
-; RV32V-NEXT:    li s6, 64
-; RV32V-NEXT:    li s5, 128
-; RV32V-NEXT:    li s3, 256
-; RV32V-NEXT:    li s2, 512
-; RV32V-NEXT:    li s1, 1024
-; RV32V-NEXT:    lui s0, 1
-; RV32V-NEXT:    lui t6, 2
-; RV32V-NEXT:    lui t5, 4
-; RV32V-NEXT:    lui t4, 8
-; RV32V-NEXT:    lui t3, 16
-; RV32V-NEXT:    lui t2, 32
-; RV32V-NEXT:    lui t1, 64
-; RV32V-NEXT:    lui t0, 128
-; RV32V-NEXT:    lui a7, 256
-; RV32V-NEXT:    lui a6, 512
-; RV32V-NEXT:    lui a5, 1024
-; RV32V-NEXT:    lui a4, 2048
-; RV32V-NEXT:    lui a3, 4096
-; RV32V-NEXT:    lui a2, 8192
-; RV32V-NEXT:    lui a1, 16384
-; RV32V-NEXT:    sw a0, 272(sp)
-; RV32V-NEXT:    lui a0, 32768
-; RV32V-NEXT:    sw zero, 276(sp)
-; RV32V-NEXT:    sw s11, 264(sp)
-; RV32V-NEXT:    sw zero, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw s4, 260(sp)
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw ra, 252(sp)
-; RV32V-NEXT:    lui ra, 65536
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw s10, 244(sp)
-; RV32V-NEXT:    lui s10, 131072
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw s7, 236(sp)
-; RV32V-NEXT:    lui s7, 262144
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw s9, 228(sp)
-; RV32V-NEXT:    li s9, 16
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s8, 220(sp)
-; RV32V-NEXT:    li s8, 32
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s6, 212(sp)
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s5, 204(sp)
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s3, 196(sp)
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s2, 188(sp)
-; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw s1, 180(sp)
-; RV32V-NEXT:    slli s4, s4, 11
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw s4, 172(sp)
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s0, 164(sp)
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw t6, 156(sp)
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw t5, 148(sp)
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw t4, 140(sp)
-; RV32V-NEXT:    lui s5, 8
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw t3, 132(sp)
-; RV32V-NEXT:    lui s1, 16
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t2, 124(sp)
-; RV32V-NEXT:    lui t4, 32
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t1, 116(sp)
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t0, 108(sp)
-; RV32V-NEXT:    lui t2, 128
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw a7, 100(sp)
-; RV32V-NEXT:    lui s6, 256
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw a6, 92(sp)
-; RV32V-NEXT:    lui t5, 512
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw a5, 84(sp)
-; RV32V-NEXT:    lui a7, 1024
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a4, 76(sp)
-; RV32V-NEXT:    lui s0, 2048
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a3, 68(sp)
-; RV32V-NEXT:    lui t6, 4096
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a2, 60(sp)
-; RV32V-NEXT:    lui s2, 8192
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a1, 52(sp)
-; RV32V-NEXT:    lui a4, 16384
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw a0, 44(sp)
-; RV32V-NEXT:    lui a6, 32768
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw ra, 36(sp)
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    sw s10, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw s7, 20(sp)
-; RV32V-NEXT:    sw zero, 8(sp)
-; RV32V-NEXT:    sw s11, 12(sp)
-; RV32V-NEXT:    lui a5, 61681
-; RV32V-NEXT:    addi a5, a5, -241
-; RV32V-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
-; RV32V-NEXT:    vmv.v.x v2, a5
-; RV32V-NEXT:    lui a5, 209715
-; RV32V-NEXT:    addi a5, a5, 819
-; RV32V-NEXT:    vmv.v.x v0, a5
-; RV32V-NEXT:    lui a5, 349525
-; RV32V-NEXT:    addi a5, a5, 1365
-; RV32V-NEXT:    vmv.v.x v12, a5
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi a5, sp, 272
-; RV32V-NEXT:    vsetvli t0, zero, e64, m2, ta, ma
-; RV32V-NEXT:    vlse64.v v4, (a5), zero
-; RV32V-NEXT:    addi a5, sp, 264
-; RV32V-NEXT:    vlse64.v v12, (a5), zero
-; RV32V-NEXT:    addi a5, sp, 256
-; RV32V-NEXT:    vlse64.v v14, (a5), zero
-; RV32V-NEXT:    addi a5, sp, 248
-; RV32V-NEXT:    vlse64.v v16, (a5), zero
-; RV32V-NEXT:    li t0, 56
-; RV32V-NEXT:    vsrl.vi v18, v8, 24
-; RV32V-NEXT:    vsrl.vi v20, v8, 8
-; RV32V-NEXT:    vsrl.vx v22, v8, t0
-; RV32V-NEXT:    li a5, 40
-; RV32V-NEXT:    vsrl.vx v24, v8, a5
-; RV32V-NEXT:    vsll.vx v26, v8, t0
-; RV32V-NEXT:    vsrl.vx v28, v10, t0
-; RV32V-NEXT:    vsrl.vx v30, v10, a5
-; RV32V-NEXT:    addi t3, s1, -256
-; RV32V-NEXT:    vand.vx v24, v24, t3
-; RV32V-NEXT:    vor.vv v22, v24, v22
-; RV32V-NEXT:    vsll.vx v6, v10, t0
-; RV32V-NEXT:    vand.vx v24, v30, t3
-; RV32V-NEXT:    vor.vv v30, v24, v28
-; RV32V-NEXT:    vand.vx v24, v8, t3
-; RV32V-NEXT:    vsll.vx v24, v24, a5
-; RV32V-NEXT:    vor.vv v24, v26, v24
-; RV32V-NEXT:    vand.vx v26, v10, t3
-; RV32V-NEXT:    vsll.vx v26, v26, a5
-; RV32V-NEXT:    vor.vv v26, v6, v26
-; RV32V-NEXT:    vsrl.vi v28, v10, 24
-; RV32V-NEXT:    lui s3, 4080
-; RV32V-NEXT:    vand.vx v18, v18, s3
-; RV32V-NEXT:    vand.vv v20, v20, v4
-; RV32V-NEXT:    vor.vv v20, v20, v18
-; RV32V-NEXT:    vsrl.vi v18, v10, 8
-; RV32V-NEXT:    vand.vx v28, v28, s3
-; RV32V-NEXT:    vand.vv v18, v18, v4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v4, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vor.vv v6, v18, v28
-; RV32V-NEXT:    addi a3, sp, 240
-; RV32V-NEXT:    vlse64.v v18, (a3), zero
-; RV32V-NEXT:    vor.vv v28, v20, v22
-; RV32V-NEXT:    vand.vx v20, v8, s3
-; RV32V-NEXT:    vsll.vi v20, v20, 24
-; RV32V-NEXT:    vand.vv v8, v8, v4
-; RV32V-NEXT:    vsll.vi v8, v8, 8
-; RV32V-NEXT:    vor.vv v8, v20, v8
-; RV32V-NEXT:    addi a3, sp, 232
-; RV32V-NEXT:    vlse64.v v20, (a3), zero
-; RV32V-NEXT:    vor.vv v30, v6, v30
-; RV32V-NEXT:    vand.vx v22, v10, s3
-; RV32V-NEXT:    vsll.vi v22, v22, 24
-; RV32V-NEXT:    vand.vv v10, v10, v4
-; RV32V-NEXT:    vsll.vi v10, v10, 8
-; RV32V-NEXT:    vor.vv v10, v22, v10
-; RV32V-NEXT:    addi a3, sp, 224
-; RV32V-NEXT:    vlse64.v v22, (a3), zero
-; RV32V-NEXT:    vor.vv v8, v24, v8
-; RV32V-NEXT:    addi a3, sp, 216
-; RV32V-NEXT:    vlse64.v v24, (a3), zero
-; RV32V-NEXT:    vor.vv v10, v26, v10
-; RV32V-NEXT:    addi a3, sp, 208
-; RV32V-NEXT:    vlse64.v v26, (a3), zero
-; RV32V-NEXT:    vor.vv v8, v8, v28
-; RV32V-NEXT:    addi a3, sp, 200
-; RV32V-NEXT:    vlse64.v v28, (a3), zero
-; RV32V-NEXT:    vor.vv v10, v10, v30
-; RV32V-NEXT:    vsrl.vi v30, v8, 4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v2, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v8, v2
-; RV32V-NEXT:    vand.vv v30, v30, v2
-; RV32V-NEXT:    vsll.vi v8, v8, 4
-; RV32V-NEXT:    vor.vv v8, v30, v8
-; RV32V-NEXT:    vsrl.vi v30, v10, 4
-; RV32V-NEXT:    vand.vv v10, v10, v2
-; RV32V-NEXT:    vand.vv v30, v30, v2
-; RV32V-NEXT:    vsll.vi v10, v10, 4
-; RV32V-NEXT:    vor.vv v10, v30, v10
-; RV32V-NEXT:    vsrl.vi v30, v8, 2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v0, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v8, v0
-; RV32V-NEXT:    vand.vv v30, v30, v0
-; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v30, v8
-; RV32V-NEXT:    vsrl.vi v30, v10, 2
-; RV32V-NEXT:    vand.vv v10, v10, v0
-; RV32V-NEXT:    vand.vv v30, v30, v0
-; RV32V-NEXT:    vsll.vi v10, v10, 2
-; RV32V-NEXT:    vor.vv v30, v30, v10
-; RV32V-NEXT:    vsrl.vi v10, v8, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v6, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v6
-; RV32V-NEXT:    vand.vv v10, v10, v6
-; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v10, v10, v8
-; RV32V-NEXT:    vsrl.vi v8, v30, 1
-; RV32V-NEXT:    vand.vv v30, v30, v6
-; RV32V-NEXT:    vand.vv v8, v8, v6
-; RV32V-NEXT:    vadd.vv v30, v30, v30
-; RV32V-NEXT:    vor.vv v8, v8, v30
-; RV32V-NEXT:    addi a3, sp, 192
-; RV32V-NEXT:    vlse64.v v30, (a3), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v18
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v22
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v26
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi a3, sp, 184
-; RV32V-NEXT:    addi a1, sp, 176
-; RV32V-NEXT:    addi a0, sp, 168
-; RV32V-NEXT:    vlse64.v v12, (a3), zero
-; RV32V-NEXT:    vlse64.v v14, (a1), zero
-; RV32V-NEXT:    vlse64.v v16, (a0), zero
-; RV32V-NEXT:    vand.vv v18, v8, v30
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v18, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi a2, sp, 160
-; RV32V-NEXT:    addi a1, sp, 152
-; RV32V-NEXT:    addi a3, sp, 144
-; RV32V-NEXT:    addi a0, sp, 136
-; RV32V-NEXT:    vlse64.v v12, (a2), zero
-; RV32V-NEXT:    vlse64.v v14, (a1), zero
-; RV32V-NEXT:    vlse64.v v16, (a3), zero
-; RV32V-NEXT:    vlse64.v v18, (a0), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v18
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 128
-; RV32V-NEXT:    addi a1, sp, 120
-; RV32V-NEXT:    addi a2, sp, 112
-; RV32V-NEXT:    addi a3, sp, 104
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v14, (a1), zero
-; RV32V-NEXT:    vlse64.v v16, (a2), zero
-; RV32V-NEXT:    vlse64.v v18, (a3), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v18
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 96
-; RV32V-NEXT:    addi a1, sp, 88
-; RV32V-NEXT:    addi a2, sp, 80
-; RV32V-NEXT:    addi a3, sp, 72
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v14, (a1), zero
-; RV32V-NEXT:    vlse64.v v16, (a2), zero
-; RV32V-NEXT:    vlse64.v v18, (a3), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v18
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 64
-; RV32V-NEXT:    addi a1, sp, 56
-; RV32V-NEXT:    addi a2, sp, 48
-; RV32V-NEXT:    addi a3, sp, 40
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v14, (a1), zero
-; RV32V-NEXT:    vlse64.v v16, (a2), zero
-; RV32V-NEXT:    vlse64.v v18, (a3), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v18
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 32
-; RV32V-NEXT:    addi a1, sp, 24
-; RV32V-NEXT:    addi a2, sp, 16
-; RV32V-NEXT:    addi a3, sp, 8
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v14, (a1), zero
-; RV32V-NEXT:    vlse64.v v16, (a2), zero
-; RV32V-NEXT:    vlse64.v v18, (a3), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v18
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s9
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vand.vx v18, v8, a0
-; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vand.vx v20, v8, a0
-; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vand.vx v22, v8, a0
-; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vand.vx v24, v8, a0
-; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vand.vx v26, v8, a0
-; RV32V-NEXT:    vand.vx v28, v8, s4
-; RV32V-NEXT:    lui a0, 1
-; RV32V-NEXT:    vand.vx v30, v8, a0
-; RV32V-NEXT:    lui a0, 2
-; RV32V-NEXT:    vand.vx v6, v8, a0
-; RV32V-NEXT:    lui a0, 4
-; RV32V-NEXT:    vand.vx v4, v8, a0
-; RV32V-NEXT:    vand.vx v2, v8, s5
-; RV32V-NEXT:    vand.vx v0, v8, s1
-; RV32V-NEXT:    vand.vx v12, v8, t4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, t1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, t2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s6
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, t5
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, a7
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, t6
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, a4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, a6
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, ra
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s7
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vi v12, v8, 2
-; RV32V-NEXT:    vand.vi v14, v8, 1
-; RV32V-NEXT:    vand.vi v16, v8, 4
-; RV32V-NEXT:    vand.vi v8, v8, 8
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v12, v10, v14
-; RV32V-NEXT:    vmul.vv v14, v10, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v14, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v14, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v10, v8
-; RV32V-NEXT:    vmul.vv v18, v10, v18
-; RV32V-NEXT:    vmul.vv v20, v10, v20
-; RV32V-NEXT:    vmul.vv v22, v10, v22
-; RV32V-NEXT:    vmul.vv v24, v10, v24
-; RV32V-NEXT:    vmul.vv v26, v10, v26
-; RV32V-NEXT:    vmul.vv v28, v10, v28
-; RV32V-NEXT:    vmul.vv v30, v10, v30
-; RV32V-NEXT:    vmul.vv v6, v10, v6
-; RV32V-NEXT:    vmul.vv v4, v10, v4
-; RV32V-NEXT:    vmul.vv v2, v10, v2
-; RV32V-NEXT:    vmul.vv v0, v10, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v10, v10, v8
-; RV32V-NEXT:    vxor.vi v8, v12, 0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v18
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v22
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v26
-; RV32V-NEXT:    vxor.vv v8, v8, v28
-; RV32V-NEXT:    vxor.vv v8, v8, v30
-; RV32V-NEXT:    vxor.vv v8, v8, v6
-; RV32V-NEXT:    vxor.vv v8, v8, v4
-; RV32V-NEXT:    vxor.vv v8, v8, v2
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    vsrl.vx v10, v8, t0
-; RV32V-NEXT:    vsll.vx v12, v8, t0
-; RV32V-NEXT:    vsrl.vx v14, v8, a5
-; RV32V-NEXT:    vand.vx v16, v8, t3
-; RV32V-NEXT:    vand.vx v14, v14, t3
-; RV32V-NEXT:    vsrl.vi v18, v8, 24
-; RV32V-NEXT:    vand.vx v20, v8, s3
-; RV32V-NEXT:    vand.vx v18, v18, s3
-; RV32V-NEXT:    vsll.vx v16, v16, a5
-; RV32V-NEXT:    vsrl.vi v22, v8, 8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v24, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v24
-; RV32V-NEXT:    vand.vv v22, v22, v24
-; RV32V-NEXT:    vor.vv v10, v14, v10
-; RV32V-NEXT:    vor.vv v14, v22, v18
-; RV32V-NEXT:    vsll.vi v8, v8, 8
-; RV32V-NEXT:    vsll.vi v18, v20, 24
-; RV32V-NEXT:    vor.vv v8, v18, v8
-; RV32V-NEXT:    vor.vv v12, v12, v16
-; RV32V-NEXT:    vor.vv v10, v14, v10
-; RV32V-NEXT:    vor.vv v8, v12, v8
-; RV32V-NEXT:    vor.vv v8, v8, v10
-; RV32V-NEXT:    vsrl.vi v10, v8, 4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v12
-; RV32V-NEXT:    vand.vv v10, v10, v12
-; RV32V-NEXT:    vsll.vi v8, v8, 4
-; RV32V-NEXT:    vor.vv v8, v10, v8
-; RV32V-NEXT:    vsrl.vi v10, v8, 2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v12
-; RV32V-NEXT:    vand.vv v10, v10, v12
-; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v10, v8
-; RV32V-NEXT:    vsrl.vi v10, v8, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v12
-; RV32V-NEXT:    vand.vv v10, v10, v12
-; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v8, v10, v8
-; RV32V-NEXT:    vsrl.vi v8, v8, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add sp, sp, a0
-; RV32V-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    addi sp, sp, 352
-; RV32V-NEXT:    ret
-;
-; RV64V-LABEL: clmulh_nxv2i64_vv:
-; RV64V:       # %bb.0:
-; RV64V-NEXT:    addi sp, sp, -320
-; RV64V-NEXT:    sd ra, 312(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s0, 304(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s1, 296(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s2, 288(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s3, 280(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s4, 272(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s5, 264(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s6, 256(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s7, 248(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s8, 240(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s9, 232(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s10, 224(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s11, 216(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    sub sp, sp, a0
-; RV64V-NEXT:    lui a1, 16
-; RV64V-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64V-NEXT:    vsrl.vi v16, v8, 24
-; RV64V-NEXT:    vsrl.vi v12, v8, 8
-; RV64V-NEXT:    li a4, 255
-; RV64V-NEXT:    lui a0, 61681
-; RV64V-NEXT:    lui a2, 209715
-; RV64V-NEXT:    lui a3, 349525
-; RV64V-NEXT:    vsrl.vi v18, v10, 24
-; RV64V-NEXT:    vsrl.vi v14, v10, 8
-; RV64V-NEXT:    li a5, 16
-; RV64V-NEXT:    li a6, 32
-; RV64V-NEXT:    li a7, 64
-; RV64V-NEXT:    li t0, 128
-; RV64V-NEXT:    li t2, 256
-; RV64V-NEXT:    li t3, 512
-; RV64V-NEXT:    li t4, 1024
-; RV64V-NEXT:    li t1, 1
-; RV64V-NEXT:    lui s4, 1
-; RV64V-NEXT:    lui s3, 2
-; RV64V-NEXT:    lui s5, 4
-; RV64V-NEXT:    lui t5, 8
-; RV64V-NEXT:    lui s0, 32
-; RV64V-NEXT:    li s6, 56
-; RV64V-NEXT:    vsrl.vx v30, v8, s6
-; RV64V-NEXT:    li s7, 40
-; RV64V-NEXT:    vsrl.vx v0, v8, s7
-; RV64V-NEXT:    addi s9, a1, -256
-; RV64V-NEXT:    lui s8, 4080
-; RV64V-NEXT:    vand.vx v2, v16, s8
-; RV64V-NEXT:    slli a4, a4, 24
-; RV64V-NEXT:    vand.vx v4, v8, s8
-; RV64V-NEXT:    vsll.vx v6, v8, s6
-; RV64V-NEXT:    addi t6, a0, -241
-; RV64V-NEXT:    addi s1, a2, 819
-; RV64V-NEXT:    addi s2, a3, 1365
-; RV64V-NEXT:    vsrl.vx v20, v10, s6
-; RV64V-NEXT:    vsrl.vx v22, v10, s7
-; RV64V-NEXT:    vand.vx v18, v18, s8
-; RV64V-NEXT:    vand.vx v24, v10, s8
-; RV64V-NEXT:    vsll.vx v16, v10, s6
-; RV64V-NEXT:    slli s11, t1, 11
-; RV64V-NEXT:    slli a0, t1, 31
-; RV64V-NEXT:    sd a0, 152(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t1, 32
-; RV64V-NEXT:    sd a0, 144(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t1, 33
-; RV64V-NEXT:    sd a0, 136(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t1, 34
-; RV64V-NEXT:    sd a0, 128(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t6, 32
-; RV64V-NEXT:    add t6, t6, a0
-; RV64V-NEXT:    slli a0, s1, 32
-; RV64V-NEXT:    add s1, s1, a0
-; RV64V-NEXT:    slli a0, s2, 32
-; RV64V-NEXT:    add s2, s2, a0
-; RV64V-NEXT:    slli a0, t1, 35
-; RV64V-NEXT:    sd a0, 120(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v22, v22, s9
-; RV64V-NEXT:    vand.vx v14, v14, a4
-; RV64V-NEXT:    vsll.vi v24, v24, 24
-; RV64V-NEXT:    vand.vx v26, v10, a4
-; RV64V-NEXT:    vand.vx v10, v10, s9
-; RV64V-NEXT:    vor.vv v20, v22, v20
-; RV64V-NEXT:    vor.vv v14, v14, v18
-; RV64V-NEXT:    vsll.vi v18, v26, 8
-; RV64V-NEXT:    li a0, 40
-; RV64V-NEXT:    vsll.vx v10, v10, a0
-; RV64V-NEXT:    vor.vv v14, v14, v20
-; RV64V-NEXT:    vor.vv v18, v24, v18
-; RV64V-NEXT:    vor.vv v10, v16, v10
-; RV64V-NEXT:    vor.vv v10, v10, v18
-; RV64V-NEXT:    vor.vv v10, v10, v14
-; RV64V-NEXT:    vsrl.vi v14, v10, 4
-; RV64V-NEXT:    vand.vx v10, v10, t6
-; RV64V-NEXT:    vand.vx v14, v14, t6
-; RV64V-NEXT:    vsll.vi v10, v10, 4
-; RV64V-NEXT:    vor.vv v10, v14, v10
-; RV64V-NEXT:    vsrl.vi v14, v10, 2
-; RV64V-NEXT:    vand.vx v10, v10, s1
-; RV64V-NEXT:    vand.vx v14, v14, s1
-; RV64V-NEXT:    vsll.vi v10, v10, 2
-; RV64V-NEXT:    vor.vv v10, v14, v10
-; RV64V-NEXT:    vsrl.vi v14, v10, 1
-; RV64V-NEXT:    vand.vx v10, v10, s2
-; RV64V-NEXT:    vand.vx v14, v14, s2
-; RV64V-NEXT:    vadd.vv v10, v10, v10
-; RV64V-NEXT:    vor.vv v14, v14, v10
-; RV64V-NEXT:    vand.vx v20, v14, a5
-; RV64V-NEXT:    slli a2, t1, 36
-; RV64V-NEXT:    sd a2, 112(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v22, v14, a6
-; RV64V-NEXT:    slli a2, t1, 37
-; RV64V-NEXT:    sd a2, 104(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v24, v14, a7
-; RV64V-NEXT:    slli a2, t1, 38
-; RV64V-NEXT:    sd a2, 96(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v26, v14, t0
-; RV64V-NEXT:    slli a2, t1, 39
-; RV64V-NEXT:    sd a2, 88(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v28, v14, t2
-; RV64V-NEXT:    slli a2, t1, 40
-; RV64V-NEXT:    sd a2, 80(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, t3
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 7
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 208
-; RV64V-NEXT:    vs2r.v v10, (a2) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    slli a2, t1, 41
-; RV64V-NEXT:    sd a2, 72(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, t4
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 208
-; RV64V-NEXT:    vs2r.v v10, (a2) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    slli a2, t1, 42
-; RV64V-NEXT:    sd a2, 64(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, s11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 208
-; RV64V-NEXT:    vs2r.v v10, (a2) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    slli a2, t1, 43
-; RV64V-NEXT:    sd a2, 56(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s9, 160(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v0, s9
-; RV64V-NEXT:    vor.vv v10, v10, v30
-; RV64V-NEXT:    vand.vx v0, v14, s4
-; RV64V-NEXT:    slli a2, t1, 44
-; RV64V-NEXT:    sd a2, 48(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd a4, 168(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v12, a4
-; RV64V-NEXT:    vor.vv v12, v12, v2
-; RV64V-NEXT:    vand.vx v2, v14, s3
-; RV64V-NEXT:    slli a2, t1, 45
-; RV64V-NEXT:    sd a2, 40(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vsll.vi v16, v4, 24
-; RV64V-NEXT:    vor.vv v10, v12, v10
-; RV64V-NEXT:    vand.vx v12, v8, a4
-; RV64V-NEXT:    vsll.vi v12, v12, 8
-; RV64V-NEXT:    vor.vv v12, v16, v12
-; RV64V-NEXT:    vand.vx v16, v14, s5
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 208
-; RV64V-NEXT:    vs2r.v v16, (a2) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    slli a2, t1, 46
-; RV64V-NEXT:    sd a2, 32(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v8, s9
-; RV64V-NEXT:    vsll.vx v8, v8, a0
-; RV64V-NEXT:    vor.vv v8, v6, v8
-; RV64V-NEXT:    vand.vx v16, v14, t5
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a2, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a2, a2, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a2, a2, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a2
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v16, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    slli a0, t1, 47
-; RV64V-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vor.vv v8, v8, v12
-; RV64V-NEXT:    vand.vx v12, v14, a1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    slli a0, t1, 48
-; RV64V-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vor.vv v8, v8, v10
-; RV64V-NEXT:    vsrl.vi v10, v8, 4
-; RV64V-NEXT:    sd t6, 176(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v8, t6
-; RV64V-NEXT:    vand.vx v10, v10, t6
-; RV64V-NEXT:    vsll.vi v8, v8, 4
-; RV64V-NEXT:    vor.vv v8, v10, v8
-; RV64V-NEXT:    vsrl.vi v10, v8, 2
-; RV64V-NEXT:    sd s1, 184(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v8, s1
-; RV64V-NEXT:    vand.vx v10, v10, s1
-; RV64V-NEXT:    vsll.vi v8, v8, 2
-; RV64V-NEXT:    vor.vv v8, v10, v8
-; RV64V-NEXT:    vsrl.vi v10, v8, 1
-; RV64V-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v8, s2
-; RV64V-NEXT:    vand.vx v10, v10, s2
-; RV64V-NEXT:    vadd.vv v8, v8, v8
-; RV64V-NEXT:    vor.vv v8, v10, v8
-; RV64V-NEXT:    vand.vx v10, v14, s0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    slli ra, t1, 49
-; RV64V-NEXT:    slli s11, t1, 50
-; RV64V-NEXT:    slli s10, t1, 51
-; RV64V-NEXT:    slli s9, t1, 52
-; RV64V-NEXT:    slli s8, t1, 53
-; RV64V-NEXT:    slli s7, t1, 54
-; RV64V-NEXT:    slli s6, t1, 55
-; RV64V-NEXT:    slli s5, t1, 56
-; RV64V-NEXT:    slli s4, t1, 57
-; RV64V-NEXT:    slli s3, t1, 58
-; RV64V-NEXT:    slli s2, t1, 59
-; RV64V-NEXT:    slli t6, t1, 60
-; RV64V-NEXT:    slli s1, t1, 61
-; RV64V-NEXT:    slli s0, t1, 62
-; RV64V-NEXT:    li a0, -1
-; RV64V-NEXT:    slli t5, a0, 63
-; RV64V-NEXT:    lui a0, 64
-; RV64V-NEXT:    lui a1, 128
-; RV64V-NEXT:    lui a2, 256
-; RV64V-NEXT:    lui a3, 512
-; RV64V-NEXT:    lui a4, 1024
-; RV64V-NEXT:    lui a5, 2048
-; RV64V-NEXT:    lui a6, 4096
-; RV64V-NEXT:    lui a7, 8192
-; RV64V-NEXT:    lui t0, 16384
-; RV64V-NEXT:    lui t1, 32768
-; RV64V-NEXT:    lui t2, 65536
-; RV64V-NEXT:    lui t3, 131072
-; RV64V-NEXT:    lui t4, 262144
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    sd s6, 8(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    mv s6, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add s6, s6, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add s6, s6, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, s6
-; RV64V-NEXT:    ld s6, 8(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, a1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, a2
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, a3
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, a4
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, a5
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, a6
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, a7
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, t0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, t1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, t2
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, t3
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, t4
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    ld a0, 152(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    ld a0, 144(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    ld a0, 136(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    ld a0, 128(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    ld a0, 120(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    ld a0, 112(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    ld a0, 104(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    ld a0, 96(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    ld a0, 88(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    ld a0, 80(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    ld a0, 72(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    ld a0, 64(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    ld a0, 56(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v10, v14, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, ra
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, s11
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, s10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, s9
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, s8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, s7
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, s6
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, s5
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, s4
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, s3
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, s2
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v14, t6
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vi v10, v14, 2
-; RV64V-NEXT:    vand.vi v12, v14, 1
-; RV64V-NEXT:    vand.vi v16, v14, 4
-; RV64V-NEXT:    vand.vi v18, v14, 8
-; RV64V-NEXT:    vand.vx v30, v14, s1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v30, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v30, v14, s0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v30, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vand.vx v14, v14, t5
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v14, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vmul.vv v10, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    vmul.vv v12, v8, v16
-; RV64V-NEXT:    vmul.vv v14, v8, v18
-; RV64V-NEXT:    vmul.vv v16, v8, v20
-; RV64V-NEXT:    vmul.vv v18, v8, v22
-; RV64V-NEXT:    vmul.vv v20, v8, v24
-; RV64V-NEXT:    vmul.vv v22, v8, v26
-; RV64V-NEXT:    vmul.vv v24, v8, v28
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 7
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v26, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v26, v8, v26
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v28, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v28, v8, v28
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v30, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v30, v8, v30
-; RV64V-NEXT:    vmul.vv v6, v8, v0
-; RV64V-NEXT:    vmul.vv v4, v8, v2
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v2, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v2, v8, v2
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v0, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v0, v8, v0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    addi a0, sp, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 7
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v10, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v8, v8, v10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v8, v10
-; RV64V-NEXT:    vxor.vv v10, v10, v12
-; RV64V-NEXT:    vxor.vv v10, v10, v14
-; RV64V-NEXT:    vxor.vv v10, v10, v16
-; RV64V-NEXT:    vxor.vv v10, v10, v18
-; RV64V-NEXT:    vxor.vv v10, v10, v20
-; RV64V-NEXT:    vxor.vv v10, v10, v22
-; RV64V-NEXT:    vxor.vv v12, v10, v24
-; RV64V-NEXT:    vxor.vv v12, v12, v26
-; RV64V-NEXT:    vxor.vv v12, v12, v28
-; RV64V-NEXT:    vxor.vv v12, v12, v30
-; RV64V-NEXT:    vxor.vv v12, v12, v6
-; RV64V-NEXT:    vxor.vv v12, v12, v4
-; RV64V-NEXT:    vxor.vv v12, v12, v2
-; RV64V-NEXT:    vxor.vv v12, v12, v0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 208
-; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v14, v12, v8
-; RV64V-NEXT:    addi a0, sp, 208
-; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v14, v14, v8
-; RV64V-NEXT:    li a0, 56
-; RV64V-NEXT:    vsll.vx v10, v10, a0
-; RV64V-NEXT:    ld a2, 160(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v12, a2
-; RV64V-NEXT:    li a4, 40
-; RV64V-NEXT:    vsll.vx v12, v12, a4
-; RV64V-NEXT:    vor.vv v10, v10, v12
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 3
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v12, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v14, v12
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 3
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 3
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 4
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 5
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 3
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 3
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 4
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 3
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 3
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v14, v12, v14
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 3
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v14, v14, v16
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 3
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 3
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v14, v14, v16
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 4
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v14, v14, v16
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 4
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v14, v14, v16
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 5
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v14, v14, v16
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 6
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v14, v14, v16
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v14, v14, v16
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v14, v16
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v18, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v18
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 3
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v18, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v18
-; RV64V-NEXT:    vsrl.vi v18, v12, 8
-; RV64V-NEXT:    ld a3, 168(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v18, v18, a3
-; RV64V-NEXT:    vsrl.vi v14, v14, 24
-; RV64V-NEXT:    lui a1, 4080
-; RV64V-NEXT:    vand.vx v14, v14, a1
-; RV64V-NEXT:    vor.vv v14, v18, v14
-; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    mv a6, a5
-; RV64V-NEXT:    slli a5, a5, 3
-; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    add a5, a5, a6
-; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 208
-; RV64V-NEXT:    vl2r.v v18, (a5) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v18
-; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 4
-; RV64V-NEXT:    mv a6, a5
-; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    add a5, a5, a6
-; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 208
-; RV64V-NEXT:    vl2r.v v18, (a5) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v18
-; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    mv a6, a5
-; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 2
-; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    add a5, a5, a6
-; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 208
-; RV64V-NEXT:    vl2r.v v18, (a5) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v18
-; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 2
-; RV64V-NEXT:    mv a6, a5
-; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 2
-; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    add a5, a5, a6
-; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 208
-; RV64V-NEXT:    vl2r.v v18, (a5) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v18
-; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    mv a6, a5
-; RV64V-NEXT:    slli a5, a5, 2
-; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 2
-; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    add a5, a5, a6
-; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 208
-; RV64V-NEXT:    vl2r.v v18, (a5) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v18
-; RV64V-NEXT:    vand.vx v12, v12, a1
-; RV64V-NEXT:    vsll.vi v12, v12, 24
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    mv a5, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a5, a5, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a5, a5, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a5, a5, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a1, a1, a5
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v18, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v18, v16, v18
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    mv a5, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a5, a5, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a5, a5, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a1, a1, a5
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v20, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v18, v18, v20
-; RV64V-NEXT:    vand.vx v20, v16, a3
-; RV64V-NEXT:    vsll.vi v20, v20, 8
-; RV64V-NEXT:    vor.vv v12, v12, v20
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 3
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v20, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v18, v18, v20
-; RV64V-NEXT:    vor.vv v10, v10, v12
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    mv a3, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a3, a3, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a1, a1, a3
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v12, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v18, v12
-; RV64V-NEXT:    vsrl.vx v16, v16, a4
-; RV64V-NEXT:    vand.vx v16, v16, a2
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 7
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v18, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v18
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    mv a2, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a1, a1, a2
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v18, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v18
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    mv a2, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a1, a1, a2
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v18, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v18
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    mv a2, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a1, a1, a2
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 208
-; RV64V-NEXT:    vl2r.v v8, (a1) # vscale x 16-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v12, v8
-; RV64V-NEXT:    vsrl.vx v8, v8, a0
-; RV64V-NEXT:    vor.vv v8, v16, v8
-; RV64V-NEXT:    vor.vv v8, v14, v8
-; RV64V-NEXT:    vor.vv v8, v10, v8
-; RV64V-NEXT:    vsrl.vi v10, v8, 4
-; RV64V-NEXT:    ld a0, 176(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v8, a0
-; RV64V-NEXT:    vand.vx v10, v10, a0
-; RV64V-NEXT:    vsll.vi v8, v8, 4
-; RV64V-NEXT:    vor.vv v8, v10, v8
-; RV64V-NEXT:    vsrl.vi v10, v8, 2
-; RV64V-NEXT:    ld a0, 184(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v8, a0
-; RV64V-NEXT:    vand.vx v10, v10, a0
-; RV64V-NEXT:    vsll.vi v8, v8, 2
-; RV64V-NEXT:    vor.vv v8, v10, v8
-; RV64V-NEXT:    vsrl.vi v10, v8, 1
-; RV64V-NEXT:    ld a0, 192(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v8, a0
-; RV64V-NEXT:    vand.vx v10, v10, a0
-; RV64V-NEXT:    vadd.vv v8, v8, v8
-; RV64V-NEXT:    vor.vv v8, v10, v8
-; RV64V-NEXT:    vsrl.vi v8, v8, 1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add sp, sp, a0
-; RV64V-NEXT:    ld ra, 312(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s0, 304(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s1, 296(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s2, 288(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s3, 280(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s4, 272(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s5, 264(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s6, 256(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s7, 248(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s8, 240(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s9, 232(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s10, 224(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s11, 216(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    addi sp, sp, 320
-; RV64V-NEXT:    ret
-;
-; RV32ZVBC-LABEL: clmulh_nxv2i64_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v10
-; RV32ZVBC-NEXT:    ret
-;
-; RV64ZVBC-LABEL: clmulh_nxv2i64_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64ZVBC-NEXT:    vclmulh.vv v8, v8, v10
-; RV64ZVBC-NEXT:    ret
-  %va.ext = zext <vscale x 2 x i64> %va to <vscale x 2 x i128>
-  %vb.ext = zext <vscale x 2 x i64> %vb to <vscale x 2 x i128>
-  %clmul = call <vscale x 2 x i128> @llvm.clmul.nxv2i128(<vscale x 2 x i128> %va.ext, <vscale x 2 x i128> %vb.ext)
-  %res.ext = lshr <vscale x 2 x i128> %clmul, splat(i128 64)
-  %res = trunc <vscale x 2 x i128> %res.ext to <vscale x 2 x i64>
-  ret <vscale x 2 x i64> %res
-}
-
-define <vscale x 2 x i64> @clmulh_nxv2i64_vx(<vscale x 2 x i64> %va, i64 %b) nounwind {
-; RV32V-LABEL: clmulh_nxv2i64_vx:
-; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -368
-; RV32V-NEXT:    sw ra, 364(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s0, 360(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 356(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 352(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 348(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 344(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 340(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 336(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 332(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s8, 328(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s9, 324(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s10, 320(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s11, 316(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a2, a2, 4
-; RV32V-NEXT:    mv a3, a2
-; RV32V-NEXT:    slli a2, a2, 1
-; RV32V-NEXT:    add a3, a3, a2
-; RV32V-NEXT:    slli a2, a2, 1
-; RV32V-NEXT:    add a2, a2, a3
-; RV32V-NEXT:    sub sp, sp, a2
-; RV32V-NEXT:    sw a0, 16(sp)
-; RV32V-NEXT:    sw a1, 20(sp)
-; RV32V-NEXT:    addi s11, sp, 16
-; RV32V-NEXT:    lui s2, 1044480
-; RV32V-NEXT:    li s6, 1
-; RV32V-NEXT:    li ra, 2
-; RV32V-NEXT:    li s3, 4
-; RV32V-NEXT:    li s7, 8
-; RV32V-NEXT:    li s9, 32
-; RV32V-NEXT:    li s8, 64
-; RV32V-NEXT:    li s5, 128
-; RV32V-NEXT:    li s4, 256
-; RV32V-NEXT:    li s1, 512
-; RV32V-NEXT:    li s0, 1024
-; RV32V-NEXT:    lui t6, 1
-; RV32V-NEXT:    lui t5, 2
-; RV32V-NEXT:    lui t3, 4
-; RV32V-NEXT:    lui t2, 8
-; RV32V-NEXT:    lui t1, 16
-; RV32V-NEXT:    lui t0, 32
-; RV32V-NEXT:    lui t4, 64
-; RV32V-NEXT:    lui a7, 128
-; RV32V-NEXT:    lui a6, 256
-; RV32V-NEXT:    lui a5, 512
-; RV32V-NEXT:    lui a4, 1024
-; RV32V-NEXT:    lui a3, 2048
-; RV32V-NEXT:    lui a2, 4096
-; RV32V-NEXT:    lui a1, 8192
-; RV32V-NEXT:    vsetvli s10, zero, e64, m2, ta, ma
-; RV32V-NEXT:    vlse64.v v18, (s11), zero
-; RV32V-NEXT:    lui s10, 16384
-; RV32V-NEXT:    sw s2, 288(sp)
-; RV32V-NEXT:    lui s11, 32768
-; RV32V-NEXT:    sw zero, 292(sp)
-; RV32V-NEXT:    lui a0, 524288
-; RV32V-NEXT:    sw a0, 280(sp)
-; RV32V-NEXT:    sw zero, 284(sp)
-; RV32V-NEXT:    sw zero, 272(sp)
-; RV32V-NEXT:    sw s6, 276(sp)
-; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw ra, 268(sp)
-; RV32V-NEXT:    lui ra, 65536
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw s3, 260(sp)
-; RV32V-NEXT:    lui s2, 131072
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw s7, 252(sp)
-; RV32V-NEXT:    lui s3, 262144
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    li s7, 16
-; RV32V-NEXT:    sw s7, 244(sp)
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw s9, 236(sp)
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw s8, 228(sp)
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s5, 220(sp)
-; RV32V-NEXT:    li s5, 128
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s4, 212(sp)
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s1, 204(sp)
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s0, 196(sp)
-; RV32V-NEXT:    slli s6, s6, 11
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s6, 188(sp)
-; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw t6, 180(sp)
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw t5, 172(sp)
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw t3, 164(sp)
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw t2, 156(sp)
-; RV32V-NEXT:    lui t6, 8
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw t1, 148(sp)
-; RV32V-NEXT:    lui s8, 16
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw t0, 140(sp)
-; RV32V-NEXT:    lui s0, 32
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw t4, 132(sp)
-; RV32V-NEXT:    lui t2, 64
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw a7, 124(sp)
-; RV32V-NEXT:    lui t3, 128
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw a6, 116(sp)
-; RV32V-NEXT:    lui s1, 256
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw a5, 108(sp)
-; RV32V-NEXT:    lui s7, 512
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw a4, 100(sp)
-; RV32V-NEXT:    lui a5, 1024
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw a3, 92(sp)
-; RV32V-NEXT:    lui s4, 2048
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw a2, 84(sp)
-; RV32V-NEXT:    lui t5, 4096
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a1, 76(sp)
-; RV32V-NEXT:    lui a4, 8192
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw s10, 68(sp)
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw s11, 60(sp)
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw ra, 52(sp)
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw s2, 44(sp)
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw s3, 36(sp)
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    sw a0, 28(sp)
-; RV32V-NEXT:    lui a6, 61681
-; RV32V-NEXT:    addi a6, a6, -241
-; RV32V-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
-; RV32V-NEXT:    vmv.v.x v4, a6
-; RV32V-NEXT:    lui a6, 209715
-; RV32V-NEXT:    addi a6, a6, 819
-; RV32V-NEXT:    vmv.v.x v2, a6
-; RV32V-NEXT:    lui a6, 349525
-; RV32V-NEXT:    addi a6, a6, 1365
-; RV32V-NEXT:    vmv.v.x v0, a6
-; RV32V-NEXT:    addi a6, sp, 288
-; RV32V-NEXT:    vsetvli t0, zero, e64, m2, ta, ma
-; RV32V-NEXT:    vlse64.v v6, (a6), zero
-; RV32V-NEXT:    addi a6, sp, 280
-; RV32V-NEXT:    vlse64.v v12, (a6), zero
-; RV32V-NEXT:    addi a6, sp, 272
-; RV32V-NEXT:    vlse64.v v14, (a6), zero
-; RV32V-NEXT:    addi a6, sp, 264
-; RV32V-NEXT:    vlse64.v v16, (a6), zero
-; RV32V-NEXT:    li t0, 56
-; RV32V-NEXT:    vsrl.vi v10, v8, 24
-; RV32V-NEXT:    vsrl.vi v20, v8, 8
-; RV32V-NEXT:    vsrl.vx v22, v8, t0
-; RV32V-NEXT:    li a6, 40
-; RV32V-NEXT:    vsrl.vx v24, v8, a6
-; RV32V-NEXT:    lui t1, 4080
-; RV32V-NEXT:    vand.vx v28, v10, t1
-; RV32V-NEXT:    vsll.vx v10, v8, t0
-; RV32V-NEXT:    addi t4, s8, -256
-; RV32V-NEXT:    vand.vx v24, v24, t4
-; RV32V-NEXT:    vand.vx v26, v8, t4
-; RV32V-NEXT:    vor.vv v22, v24, v22
-; RV32V-NEXT:    vsll.vx v24, v26, a6
-; RV32V-NEXT:    vor.vv v10, v10, v24
-; RV32V-NEXT:    vsrl.vx v24, v18, t0
-; RV32V-NEXT:    vsrl.vx v26, v18, a6
-; RV32V-NEXT:    vsll.vx v30, v18, t0
-; RV32V-NEXT:    vand.vx v26, v26, t4
-; RV32V-NEXT:    vor.vv v24, v26, v24
-; RV32V-NEXT:    vand.vx v26, v18, t4
-; RV32V-NEXT:    vsll.vx v26, v26, a6
-; RV32V-NEXT:    vor.vv v26, v30, v26
-; RV32V-NEXT:    vsrl.vi v30, v18, 24
-; RV32V-NEXT:    vand.vv v20, v20, v6
-; RV32V-NEXT:    vor.vv v28, v20, v28
-; RV32V-NEXT:    vsrl.vi v20, v18, 8
-; RV32V-NEXT:    vand.vx v30, v30, t1
-; RV32V-NEXT:    vand.vv v20, v20, v6
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v6, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vor.vv v30, v20, v30
-; RV32V-NEXT:    addi a3, sp, 256
-; RV32V-NEXT:    vlse64.v v20, (a3), zero
-; RV32V-NEXT:    vor.vv v28, v28, v22
-; RV32V-NEXT:    vand.vx v22, v8, t1
-; RV32V-NEXT:    vsll.vi v22, v22, 24
-; RV32V-NEXT:    vand.vv v8, v8, v6
-; RV32V-NEXT:    vsll.vi v8, v8, 8
-; RV32V-NEXT:    vor.vv v8, v22, v8
-; RV32V-NEXT:    addi a3, sp, 248
-; RV32V-NEXT:    vlse64.v v22, (a3), zero
-; RV32V-NEXT:    vor.vv v30, v30, v24
-; RV32V-NEXT:    vand.vx v24, v18, t1
-; RV32V-NEXT:    vsll.vi v24, v24, 24
-; RV32V-NEXT:    vand.vv v18, v18, v6
-; RV32V-NEXT:    vsll.vi v18, v18, 8
-; RV32V-NEXT:    vor.vv v6, v24, v18
-; RV32V-NEXT:    addi a3, sp, 240
-; RV32V-NEXT:    vlse64.v v18, (a3), zero
-; RV32V-NEXT:    vor.vv v8, v10, v8
-; RV32V-NEXT:    addi a3, sp, 232
-; RV32V-NEXT:    vlse64.v v24, (a3), zero
-; RV32V-NEXT:    vor.vv v10, v26, v6
-; RV32V-NEXT:    addi a3, sp, 224
-; RV32V-NEXT:    vlse64.v v26, (a3), zero
-; RV32V-NEXT:    vor.vv v8, v8, v28
-; RV32V-NEXT:    addi a3, sp, 216
-; RV32V-NEXT:    vlse64.v v28, (a3), zero
-; RV32V-NEXT:    vor.vv v10, v10, v30
-; RV32V-NEXT:    vsrl.vi v30, v8, 4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v4, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v8, v4
-; RV32V-NEXT:    vand.vv v30, v30, v4
-; RV32V-NEXT:    vsll.vi v8, v8, 4
-; RV32V-NEXT:    vor.vv v8, v30, v8
-; RV32V-NEXT:    vsrl.vi v30, v10, 4
-; RV32V-NEXT:    vand.vv v10, v10, v4
-; RV32V-NEXT:    vand.vv v30, v30, v4
-; RV32V-NEXT:    vsll.vi v10, v10, 4
-; RV32V-NEXT:    vor.vv v10, v30, v10
-; RV32V-NEXT:    vsrl.vi v30, v8, 2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v2, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v8, v2
-; RV32V-NEXT:    vand.vv v30, v30, v2
-; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v30, v8
-; RV32V-NEXT:    vsrl.vi v30, v10, 2
-; RV32V-NEXT:    vand.vv v10, v10, v2
-; RV32V-NEXT:    vand.vv v30, v30, v2
-; RV32V-NEXT:    vsll.vi v10, v10, 2
-; RV32V-NEXT:    vor.vv v30, v30, v10
-; RV32V-NEXT:    vsrl.vi v10, v8, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v0, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v8, v0
-; RV32V-NEXT:    vand.vv v10, v10, v0
-; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v10, v10, v8
-; RV32V-NEXT:    vsrl.vi v8, v30, 1
-; RV32V-NEXT:    vand.vv v30, v30, v0
-; RV32V-NEXT:    vand.vv v8, v8, v0
-; RV32V-NEXT:    vadd.vv v30, v30, v30
-; RV32V-NEXT:    vor.vv v8, v8, v30
-; RV32V-NEXT:    addi a3, sp, 208
-; RV32V-NEXT:    vlse64.v v30, (a3), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v22
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v18
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v26
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi a3, sp, 200
-; RV32V-NEXT:    addi a1, sp, 192
-; RV32V-NEXT:    addi a0, sp, 184
-; RV32V-NEXT:    vlse64.v v12, (a3), zero
-; RV32V-NEXT:    vlse64.v v14, (a1), zero
-; RV32V-NEXT:    vlse64.v v16, (a0), zero
-; RV32V-NEXT:    vand.vv v18, v8, v30
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v18, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi a2, sp, 176
-; RV32V-NEXT:    addi a1, sp, 168
-; RV32V-NEXT:    addi a3, sp, 160
-; RV32V-NEXT:    addi a0, sp, 152
-; RV32V-NEXT:    vlse64.v v12, (a2), zero
-; RV32V-NEXT:    vlse64.v v14, (a1), zero
-; RV32V-NEXT:    vlse64.v v16, (a3), zero
-; RV32V-NEXT:    vlse64.v v18, (a0), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v18
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 144
-; RV32V-NEXT:    addi a1, sp, 136
-; RV32V-NEXT:    addi a2, sp, 128
-; RV32V-NEXT:    addi a3, sp, 120
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v14, (a1), zero
-; RV32V-NEXT:    vlse64.v v16, (a2), zero
-; RV32V-NEXT:    vlse64.v v18, (a3), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v18
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 112
-; RV32V-NEXT:    addi a1, sp, 104
-; RV32V-NEXT:    addi a2, sp, 96
-; RV32V-NEXT:    addi a3, sp, 88
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v14, (a1), zero
-; RV32V-NEXT:    vlse64.v v16, (a2), zero
-; RV32V-NEXT:    vlse64.v v18, (a3), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v18
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 80
-; RV32V-NEXT:    addi a1, sp, 72
-; RV32V-NEXT:    addi a2, sp, 64
-; RV32V-NEXT:    addi a3, sp, 56
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v14, (a1), zero
-; RV32V-NEXT:    vlse64.v v16, (a2), zero
-; RV32V-NEXT:    vlse64.v v18, (a3), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v18
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 48
-; RV32V-NEXT:    addi a1, sp, 40
-; RV32V-NEXT:    addi a2, sp, 32
-; RV32V-NEXT:    addi a3, sp, 24
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v14, (a1), zero
-; RV32V-NEXT:    vlse64.v v16, (a2), zero
-; RV32V-NEXT:    vlse64.v v18, (a3), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v14
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v18
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vand.vx v12, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vand.vx v12, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vand.vx v18, v8, a0
-; RV32V-NEXT:    vand.vx v20, v8, s5
-; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vand.vx v22, v8, a0
-; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vand.vx v24, v8, a0
-; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vand.vx v26, v8, a0
-; RV32V-NEXT:    vand.vx v28, v8, s6
-; RV32V-NEXT:    lui a0, 1
-; RV32V-NEXT:    vand.vx v30, v8, a0
-; RV32V-NEXT:    lui a0, 2
-; RV32V-NEXT:    vand.vx v6, v8, a0
-; RV32V-NEXT:    lui a0, 4
-; RV32V-NEXT:    vand.vx v4, v8, a0
-; RV32V-NEXT:    vand.vx v2, v8, t6
-; RV32V-NEXT:    vand.vx v0, v8, s8
-; RV32V-NEXT:    vand.vx v12, v8, s0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, t2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, t3
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s7
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, a5
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, t5
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, a4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s11
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, ra
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s3
-; RV32V-NEXT:    addi a0, sp, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vand.vi v12, v8, 2
-; RV32V-NEXT:    vand.vi v14, v8, 1
-; RV32V-NEXT:    vand.vi v16, v8, 4
-; RV32V-NEXT:    vand.vi v8, v8, 8
-; RV32V-NEXT:    vmul.vv v12, v10, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v12, v10, v14
-; RV32V-NEXT:    vmul.vv v14, v10, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v14, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v14, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v10, v8
-; RV32V-NEXT:    vmul.vv v18, v10, v18
-; RV32V-NEXT:    vmul.vv v20, v10, v20
-; RV32V-NEXT:    vmul.vv v22, v10, v22
-; RV32V-NEXT:    vmul.vv v24, v10, v24
-; RV32V-NEXT:    vmul.vv v26, v10, v26
-; RV32V-NEXT:    vmul.vv v28, v10, v28
-; RV32V-NEXT:    vmul.vv v30, v10, v30
-; RV32V-NEXT:    vmul.vv v6, v10, v6
-; RV32V-NEXT:    vmul.vv v4, v10, v4
-; RV32V-NEXT:    vmul.vv v2, v10, v2
-; RV32V-NEXT:    vmul.vv v0, v10, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v10, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v10, v10, v8
-; RV32V-NEXT:    vxor.vi v8, v12, 0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v18
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v22
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v26
-; RV32V-NEXT:    vxor.vv v8, v8, v28
-; RV32V-NEXT:    vxor.vv v8, v8, v30
-; RV32V-NEXT:    vxor.vv v8, v8, v6
-; RV32V-NEXT:    vxor.vv v8, v8, v4
-; RV32V-NEXT:    vxor.vv v8, v8, v2
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    vsrl.vx v10, v8, t0
-; RV32V-NEXT:    vsll.vx v12, v8, t0
-; RV32V-NEXT:    vsrl.vx v14, v8, a6
-; RV32V-NEXT:    vand.vx v16, v8, t4
-; RV32V-NEXT:    vand.vx v14, v14, t4
-; RV32V-NEXT:    vsrl.vi v18, v8, 24
-; RV32V-NEXT:    vand.vx v20, v8, t1
-; RV32V-NEXT:    vand.vx v18, v18, t1
-; RV32V-NEXT:    vsll.vx v16, v16, a6
-; RV32V-NEXT:    vsrl.vi v22, v8, 8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v24, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v24
-; RV32V-NEXT:    vand.vv v22, v22, v24
-; RV32V-NEXT:    vor.vv v10, v14, v10
-; RV32V-NEXT:    vor.vv v14, v22, v18
-; RV32V-NEXT:    vsll.vi v8, v8, 8
-; RV32V-NEXT:    vsll.vi v18, v20, 24
-; RV32V-NEXT:    vor.vv v8, v18, v8
-; RV32V-NEXT:    vor.vv v12, v12, v16
-; RV32V-NEXT:    vor.vv v10, v14, v10
-; RV32V-NEXT:    vor.vv v8, v12, v8
-; RV32V-NEXT:    vor.vv v8, v8, v10
-; RV32V-NEXT:    vsrl.vi v10, v8, 4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v12
-; RV32V-NEXT:    vand.vv v10, v10, v12
-; RV32V-NEXT:    vsll.vi v8, v8, 4
-; RV32V-NEXT:    vor.vv v8, v10, v8
-; RV32V-NEXT:    vsrl.vi v10, v8, 2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v12
-; RV32V-NEXT:    vand.vv v10, v10, v12
-; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v10, v8
-; RV32V-NEXT:    vsrl.vi v10, v8, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v12
-; RV32V-NEXT:    vand.vv v10, v10, v12
-; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v8, v10, v8
-; RV32V-NEXT:    vsrl.vi v8, v8, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add sp, sp, a0
-; RV32V-NEXT:    lw ra, 364(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s0, 360(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s1, 356(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s2, 352(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s3, 348(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s4, 344(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s5, 340(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s6, 336(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s7, 332(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s8, 328(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s9, 324(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s10, 320(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s11, 316(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    addi sp, sp, 368
-; RV32V-NEXT:    ret
-;
-; RV64V-LABEL: clmulh_nxv2i64_vx:
-; RV64V:       # %bb.0:
-; RV64V-NEXT:    li a1, 56
-; RV64V-NEXT:    lui t2, 16
-; RV64V-NEXT:    lui a2, 4080
-; RV64V-NEXT:    li t0, 255
-; RV64V-NEXT:    lui a3, 61681
-; RV64V-NEXT:    lui a4, 209715
-; RV64V-NEXT:    lui a5, 349525
-; RV64V-NEXT:    srli a6, a0, 24
-; RV64V-NEXT:    srli a7, a0, 8
-; RV64V-NEXT:    srli t1, a0, 40
-; RV64V-NEXT:    srli t3, a0, 56
-; RV64V-NEXT:    addi a3, a3, -241
-; RV64V-NEXT:    addi a4, a4, 819
-; RV64V-NEXT:    addi t4, a5, 1365
-; RV64V-NEXT:    slli a5, a3, 32
-; RV64V-NEXT:    add a5, a3, a5
-; RV64V-NEXT:    slli a3, a4, 32
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, t4, 32
-; RV64V-NEXT:    add a3, t4, a3
-; RV64V-NEXT:    srliw t4, a0, 24
-; RV64V-NEXT:    slli t0, t0, 24
-; RV64V-NEXT:    and a6, a6, a2
-; RV64V-NEXT:    and a7, a7, t0
-; RV64V-NEXT:    or t5, a7, a6
-; RV64V-NEXT:    addi a6, t2, -256
-; RV64V-NEXT:    and a7, t1, a6
-; RV64V-NEXT:    or t1, a7, t3
-; RV64V-NEXT:    and a7, a0, a2
-; RV64V-NEXT:    slli t4, t4, 32
-; RV64V-NEXT:    slli a7, a7, 24
-; RV64V-NEXT:    or t3, a7, t4
-; RV64V-NEXT:    li a7, 40
-; RV64V-NEXT:    vsetvli t4, zero, e64, m2, ta, ma
-; RV64V-NEXT:    vsrl.vi v12, v8, 24
-; RV64V-NEXT:    vsrl.vi v10, v8, 8
-; RV64V-NEXT:    or t1, t5, t1
-; RV64V-NEXT:    slli t4, a0, 56
-; RV64V-NEXT:    and a0, a0, a6
-; RV64V-NEXT:    slli a0, a0, 40
-; RV64V-NEXT:    or t4, t4, a0
-; RV64V-NEXT:    li a0, 1
-; RV64V-NEXT:    or t4, t4, t3
-; RV64V-NEXT:    lui t3, 1
-; RV64V-NEXT:    vsrl.vx v14, v8, a1
-; RV64V-NEXT:    vsrl.vx v16, v8, a7
-; RV64V-NEXT:    vand.vx v12, v12, a2
-; RV64V-NEXT:    vand.vx v18, v8, a2
-; RV64V-NEXT:    vsll.vx v20, v8, a1
-; RV64V-NEXT:    vand.vx v16, v16, a6
-; RV64V-NEXT:    vand.vx v10, v10, t0
-; RV64V-NEXT:    vsll.vi v18, v18, 24
-; RV64V-NEXT:    vor.vv v14, v16, v14
-; RV64V-NEXT:    vand.vx v16, v8, t0
-; RV64V-NEXT:    vand.vx v8, v8, a6
-; RV64V-NEXT:    vor.vv v10, v10, v12
-; RV64V-NEXT:    vsll.vi v12, v16, 8
-; RV64V-NEXT:    vsll.vx v8, v8, a7
-; RV64V-NEXT:    vor.vv v10, v10, v14
-; RV64V-NEXT:    vor.vv v12, v18, v12
-; RV64V-NEXT:    vor.vv v8, v20, v8
-; RV64V-NEXT:    vor.vv v8, v8, v12
-; RV64V-NEXT:    vor.vv v8, v8, v10
-; RV64V-NEXT:    or t1, t4, t1
-; RV64V-NEXT:    vsrl.vi v10, v8, 4
-; RV64V-NEXT:    vand.vx v8, v8, a5
-; RV64V-NEXT:    srli t4, t1, 4
-; RV64V-NEXT:    and t1, t1, a5
-; RV64V-NEXT:    vand.vx v10, v10, a5
-; RV64V-NEXT:    vsll.vi v8, v8, 4
-; RV64V-NEXT:    and t4, t4, a5
-; RV64V-NEXT:    slli t1, t1, 4
-; RV64V-NEXT:    vor.vv v8, v10, v8
-; RV64V-NEXT:    or t1, t4, t1
-; RV64V-NEXT:    vsrl.vi v10, v8, 2
-; RV64V-NEXT:    vand.vx v8, v8, a4
-; RV64V-NEXT:    srli t4, t1, 2
-; RV64V-NEXT:    and t1, t1, a4
-; RV64V-NEXT:    vand.vx v10, v10, a4
-; RV64V-NEXT:    vsll.vi v8, v8, 2
-; RV64V-NEXT:    and t4, t4, a4
-; RV64V-NEXT:    slli t1, t1, 2
-; RV64V-NEXT:    vor.vv v8, v10, v8
-; RV64V-NEXT:    or t1, t4, t1
-; RV64V-NEXT:    vsrl.vi v10, v8, 1
-; RV64V-NEXT:    vand.vx v8, v8, a3
-; RV64V-NEXT:    srli t4, t1, 1
-; RV64V-NEXT:    and t1, t1, a3
-; RV64V-NEXT:    vand.vx v10, v10, a3
-; RV64V-NEXT:    vadd.vv v8, v8, v8
-; RV64V-NEXT:    and t4, t4, a3
-; RV64V-NEXT:    slli t1, t1, 1
-; RV64V-NEXT:    vor.vv v8, v10, v8
-; RV64V-NEXT:    or t1, t4, t1
-; RV64V-NEXT:    andi t4, t1, 2
-; RV64V-NEXT:    vmul.vx v10, v8, t4
-; RV64V-NEXT:    andi t4, t1, 1
-; RV64V-NEXT:    vmul.vx v12, v8, t4
-; RV64V-NEXT:    andi t4, t1, 4
-; RV64V-NEXT:    vmul.vx v14, v8, t4
-; RV64V-NEXT:    andi t4, t1, 8
-; RV64V-NEXT:    vmul.vx v16, v8, t4
-; RV64V-NEXT:    andi t4, t1, 16
-; RV64V-NEXT:    vmul.vx v18, v8, t4
-; RV64V-NEXT:    andi t4, t1, 32
-; RV64V-NEXT:    vmul.vx v20, v8, t4
-; RV64V-NEXT:    andi t4, t1, 64
-; RV64V-NEXT:    vxor.vv v10, v12, v10
-; RV64V-NEXT:    vmul.vx v12, v8, t4
-; RV64V-NEXT:    andi t4, t1, 128
-; RV64V-NEXT:    vxor.vv v10, v10, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t4
-; RV64V-NEXT:    andi t4, t1, 256
-; RV64V-NEXT:    vxor.vv v10, v10, v16
-; RV64V-NEXT:    vmul.vx v16, v8, t4
-; RV64V-NEXT:    andi t4, t1, 512
-; RV64V-NEXT:    vxor.vv v10, v10, v18
-; RV64V-NEXT:    vmul.vx v18, v8, t4
-; RV64V-NEXT:    andi t4, t1, 1024
-; RV64V-NEXT:    vxor.vv v10, v10, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t4
-; RV64V-NEXT:    slli t4, a0, 11
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v10, v10, v12
-; RV64V-NEXT:    vmul.vx v12, v8, t4
-; RV64V-NEXT:    lui t4, 2
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v10, v10, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t3
-; RV64V-NEXT:    lui t3, 4
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v16, v10, v16
-; RV64V-NEXT:    vxor.vv v16, v16, v18
-; RV64V-NEXT:    vmul.vx v18, v8, t4
-; RV64V-NEXT:    lui t4, 8
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t3
-; RV64V-NEXT:    lui t3, 32
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v12, v16, v12
-; RV64V-NEXT:    vmul.vx v16, v8, t4
-; RV64V-NEXT:    lui t4, 64
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t2
-; RV64V-NEXT:    lui t2, 128
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v12, v12, v18
-; RV64V-NEXT:    vmul.vx v18, v8, t3
-; RV64V-NEXT:    lui t3, 256
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v12, v12, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t4
-; RV64V-NEXT:    lui t4, 512
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v12, v12, v16
-; RV64V-NEXT:    vmul.vx v16, v8, t2
-; RV64V-NEXT:    lui t2, 1024
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t3
-; RV64V-NEXT:    lui t3, 2048
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v12, v12, v18
-; RV64V-NEXT:    vmul.vx v18, v8, t4
-; RV64V-NEXT:    lui t4, 4096
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v12, v12, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t2
-; RV64V-NEXT:    lui t2, 8192
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v12, v12, v16
-; RV64V-NEXT:    vmul.vx v16, v8, t3
-; RV64V-NEXT:    lui t3, 16384
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vmul.vx v22, v8, t4
-; RV64V-NEXT:    lui t4, 32768
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v12, v12, v18
-; RV64V-NEXT:    vmul.vx v18, v8, t2
-; RV64V-NEXT:    lui t2, 65536
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v14, v12, v20
-; RV64V-NEXT:    vmul.vx v12, v8, t3
-; RV64V-NEXT:    lui t3, 131072
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v20, v14, v16
-; RV64V-NEXT:    vmul.vx v14, v8, t4
-; RV64V-NEXT:    lui t4, 262144
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v16, v20, v22
-; RV64V-NEXT:    vxor.vv v18, v16, v18
-; RV64V-NEXT:    vmul.vx v16, v8, t2
-; RV64V-NEXT:    slli t2, a0, 32
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vsll.vx v10, v10, a1
-; RV64V-NEXT:    vand.vx v20, v20, a6
-; RV64V-NEXT:    vsll.vx v20, v20, a7
-; RV64V-NEXT:    vor.vv v10, v10, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t3
-; RV64V-NEXT:    slli t3, a0, 33
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v12, v18, v12
-; RV64V-NEXT:    vmul.vx v18, v8, t4
-; RV64V-NEXT:    slli t4, a0, 34
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t2
-; RV64V-NEXT:    slli t2, a0, 35
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v12, v12, v16
-; RV64V-NEXT:    vmul.vx v16, v8, t3
-; RV64V-NEXT:    slli t3, a0, 36
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v20, v12, v20
-; RV64V-NEXT:    vmul.vx v12, v8, t4
-; RV64V-NEXT:    srliw t4, t1, 31
-; RV64V-NEXT:    slli t4, t4, 31
-; RV64V-NEXT:    vxor.vv v18, v20, v18
-; RV64V-NEXT:    vmul.vx v20, v8, t4
-; RV64V-NEXT:    slli t4, a0, 37
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v18, v18, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t2
-; RV64V-NEXT:    slli t2, a0, 38
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v14, v18, v14
-; RV64V-NEXT:    vmul.vx v18, v8, t3
-; RV64V-NEXT:    slli t3, a0, 39
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v14, v14, v16
-; RV64V-NEXT:    vmul.vx v16, v8, t4
-; RV64V-NEXT:    slli t4, a0, 40
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v12, v14, v12
-; RV64V-NEXT:    vmul.vx v14, v8, t2
-; RV64V-NEXT:    slli t2, a0, 41
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v12, v12, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t3
-; RV64V-NEXT:    slli t3, a0, 42
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v12, v12, v18
-; RV64V-NEXT:    vmul.vx v18, v8, t4
-; RV64V-NEXT:    slli t4, a0, 43
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v12, v12, v16
-; RV64V-NEXT:    vmul.vx v16, v8, t2
-; RV64V-NEXT:    slli t2, a0, 44
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v12, v12, v14
-; RV64V-NEXT:    vmul.vx v14, v8, t3
-; RV64V-NEXT:    slli t3, a0, 45
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v12, v12, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t4
-; RV64V-NEXT:    slli t4, a0, 46
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v18, v12, v18
-; RV64V-NEXT:    vxor.vv v16, v18, v16
-; RV64V-NEXT:    vmul.vx v18, v8, t2
-; RV64V-NEXT:    slli t2, a0, 47
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v14, v16, v14
-; RV64V-NEXT:    vmul.vx v16, v8, t3
-; RV64V-NEXT:    slli t3, a0, 48
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v14, v14, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t4
-; RV64V-NEXT:    slli t4, a0, 49
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v14, v14, v18
-; RV64V-NEXT:    vmul.vx v18, v8, t2
-; RV64V-NEXT:    slli t2, a0, 50
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v14, v14, v16
-; RV64V-NEXT:    vmul.vx v16, v8, t3
-; RV64V-NEXT:    slli t3, a0, 51
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v14, v14, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t4
-; RV64V-NEXT:    slli t4, a0, 52
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v14, v14, v18
-; RV64V-NEXT:    vmul.vx v18, v8, t2
-; RV64V-NEXT:    slli t2, a0, 53
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v16, v14, v16
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t3
-; RV64V-NEXT:    slli t3, a0, 54
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v16, v16, v18
-; RV64V-NEXT:    vsrl.vi v18, v12, 8
-; RV64V-NEXT:    vand.vx v18, v18, t0
-; RV64V-NEXT:    vsrl.vi v14, v14, 24
-; RV64V-NEXT:    vand.vx v14, v14, a2
-; RV64V-NEXT:    vor.vv v14, v18, v14
-; RV64V-NEXT:    vmul.vx v18, v8, t4
-; RV64V-NEXT:    slli t4, a0, 55
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t2
-; RV64V-NEXT:    slli t2, a0, 56
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v16, v16, v18
-; RV64V-NEXT:    vmul.vx v18, v8, t3
-; RV64V-NEXT:    slli t3, a0, 57
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t4
-; RV64V-NEXT:    slli t4, a0, 58
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v16, v16, v18
-; RV64V-NEXT:    vmul.vx v18, v8, t2
-; RV64V-NEXT:    slli t2, a0, 59
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t3
-; RV64V-NEXT:    slli t3, a0, 60
-; RV64V-NEXT:    vand.vx v12, v12, a2
-; RV64V-NEXT:    slli a2, a0, 61
-; RV64V-NEXT:    slli a0, a0, 62
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    and a2, t1, a2
-; RV64V-NEXT:    and a0, t1, a0
-; RV64V-NEXT:    srli t1, t1, 63
-; RV64V-NEXT:    vsll.vi v12, v12, 24
-; RV64V-NEXT:    vxor.vv v18, v16, v18
-; RV64V-NEXT:    vxor.vv v18, v18, v20
-; RV64V-NEXT:    vand.vx v20, v16, t0
-; RV64V-NEXT:    vsll.vi v20, v20, 8
-; RV64V-NEXT:    vor.vv v12, v12, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t4
-; RV64V-NEXT:    vxor.vv v18, v18, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t2
-; RV64V-NEXT:    vor.vv v10, v10, v12
-; RV64V-NEXT:    vmul.vx v12, v8, t3
-; RV64V-NEXT:    vxor.vv v18, v18, v20
-; RV64V-NEXT:    vmul.vx v20, v8, a2
-; RV64V-NEXT:    vxor.vv v12, v18, v12
-; RV64V-NEXT:    vmul.vx v18, v8, a0
-; RV64V-NEXT:    slli t1, t1, 63
-; RV64V-NEXT:    vmul.vx v8, v8, t1
-; RV64V-NEXT:    vsrl.vx v16, v16, a7
-; RV64V-NEXT:    vand.vx v16, v16, a6
-; RV64V-NEXT:    vxor.vv v12, v12, v20
-; RV64V-NEXT:    vxor.vv v12, v12, v18
-; RV64V-NEXT:    vxor.vv v8, v12, v8
-; RV64V-NEXT:    vsrl.vx v8, v8, a1
-; RV64V-NEXT:    vor.vv v8, v16, v8
-; RV64V-NEXT:    vor.vv v8, v14, v8
-; RV64V-NEXT:    vor.vv v8, v10, v8
-; RV64V-NEXT:    vsrl.vi v10, v8, 4
-; RV64V-NEXT:    vand.vx v8, v8, a5
-; RV64V-NEXT:    vand.vx v10, v10, a5
-; RV64V-NEXT:    vsll.vi v8, v8, 4
-; RV64V-NEXT:    vor.vv v8, v10, v8
-; RV64V-NEXT:    vsrl.vi v10, v8, 2
-; RV64V-NEXT:    vand.vx v8, v8, a4
-; RV64V-NEXT:    vand.vx v10, v10, a4
-; RV64V-NEXT:    vsll.vi v8, v8, 2
-; RV64V-NEXT:    vor.vv v8, v10, v8
-; RV64V-NEXT:    vsrl.vi v10, v8, 1
-; RV64V-NEXT:    vand.vx v8, v8, a3
-; RV64V-NEXT:    vand.vx v10, v10, a3
-; RV64V-NEXT:    vadd.vv v8, v8, v8
-; RV64V-NEXT:    vor.vv v8, v10, v8
-; RV64V-NEXT:    vsrl.vi v8, v8, 1
-; RV64V-NEXT:    ret
-;
-; RV32ZVBC-LABEL: clmulh_nxv2i64_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    addi sp, sp, -16
-; RV32ZVBC-NEXT:    sw a0, 8(sp)
-; RV32ZVBC-NEXT:    sw a1, 12(sp)
-; RV32ZVBC-NEXT:    addi a0, sp, 8
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32ZVBC-NEXT:    vlse64.v v10, (a0), zero
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v10
-; RV32ZVBC-NEXT:    addi sp, sp, 16
-; RV32ZVBC-NEXT:    ret
-;
-; RV64ZVBC-LABEL: clmulh_nxv2i64_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV64ZVBC-NEXT:    vclmulh.vx v8, v8, a0
-; RV64ZVBC-NEXT:    ret
-  %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i128 0
-  %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-  %va.ext = zext <vscale x 2 x i64> %va to <vscale x 2 x i128>
-  %vb.ext = zext <vscale x 2 x i64> %vb to <vscale x 2 x i128>
-  %clmul = call <vscale x 2 x i128> @llvm.clmul.nxv2i128(<vscale x 2 x i128> %va.ext, <vscale x 2 x i128> %vb.ext)
-  %res.ext = lshr <vscale x 2 x i128> %clmul, splat(i128 64)
-  %res = trunc <vscale x 2 x i128> %res.ext to <vscale x 2 x i64>
-  ret <vscale x 2 x i64> %res
-}
-
-define <vscale x 4 x i64> @clmulh_nxv4i64_vv(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb) nounwind {
-; RV32V-LABEL: clmulh_nxv4i64_vv:
-; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -352
-; RV32V-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 8
-; RV32V-NEXT:    sub sp, sp, a0
-; RV32V-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32V-NEXT:    vmv4r.v v28, v12
-; RV32V-NEXT:    lui s11, 1044480
-; RV32V-NEXT:    lui t6, 524288
-; RV32V-NEXT:    li a0, 1
-; RV32V-NEXT:    li ra, 2
-; RV32V-NEXT:    li t4, 4
-; RV32V-NEXT:    li t2, 8
-; RV32V-NEXT:    li t5, 16
-; RV32V-NEXT:    li t3, 32
-; RV32V-NEXT:    li t1, 64
-; RV32V-NEXT:    li t0, 128
-; RV32V-NEXT:    li a7, 256
-; RV32V-NEXT:    li a6, 512
-; RV32V-NEXT:    li a3, 1024
-; RV32V-NEXT:    lui a2, 1
-; RV32V-NEXT:    lui a4, 2
-; RV32V-NEXT:    lui a1, 4
-; RV32V-NEXT:    lui a5, 8
-; RV32V-NEXT:    lui s0, 16
-; RV32V-NEXT:    lui s1, 32
-; RV32V-NEXT:    lui s2, 64
-; RV32V-NEXT:    lui s3, 128
-; RV32V-NEXT:    lui s4, 256
-; RV32V-NEXT:    lui s5, 512
-; RV32V-NEXT:    lui s6, 1024
-; RV32V-NEXT:    lui s7, 2048
-; RV32V-NEXT:    lui s8, 4096
-; RV32V-NEXT:    lui s9, 8192
-; RV32V-NEXT:    lui s10, 16384
-; RV32V-NEXT:    sw s11, 272(sp)
-; RV32V-NEXT:    lui s11, 32768
-; RV32V-NEXT:    sw zero, 276(sp)
-; RV32V-NEXT:    sw t6, 264(sp)
-; RV32V-NEXT:    sw zero, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw a0, 260(sp)
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw ra, 252(sp)
-; RV32V-NEXT:    lui ra, 65536
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw t4, 244(sp)
-; RV32V-NEXT:    lui t4, 131072
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw t2, 236(sp)
-; RV32V-NEXT:    lui t2, 262144
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw t5, 228(sp)
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw t3, 220(sp)
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw t1, 212(sp)
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw t0, 204(sp)
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw a7, 196(sp)
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw a6, 188(sp)
-; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw a3, 180(sp)
-; RV32V-NEXT:    slli a3, a0, 11
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw a3, 172(sp)
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw a2, 164(sp)
-; RV32V-NEXT:    lui t1, 1
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw a4, 156(sp)
-; RV32V-NEXT:    lui t3, 2
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw a1, 148(sp)
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw a5, 140(sp)
-; RV32V-NEXT:    lui t5, 8
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw s0, 132(sp)
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw s1, 124(sp)
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw s2, 116(sp)
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw s3, 108(sp)
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw s4, 100(sp)
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw s5, 92(sp)
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw s6, 84(sp)
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw s7, 76(sp)
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw s8, 68(sp)
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw s9, 60(sp)
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw s10, 52(sp)
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw s11, 44(sp)
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw ra, 36(sp)
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    sw t4, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw t2, 20(sp)
-; RV32V-NEXT:    sw zero, 8(sp)
-; RV32V-NEXT:    sw t6, 12(sp)
-; RV32V-NEXT:    addi a1, sp, 272
-; RV32V-NEXT:    vlse64.v v12, (a1), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    li a6, 56
-; RV32V-NEXT:    vsrl.vx v20, v8, a6
-; RV32V-NEXT:    li a5, 40
-; RV32V-NEXT:    vsrl.vx v24, v8, a5
-; RV32V-NEXT:    vsll.vx v16, v8, a6
-; RV32V-NEXT:    vsrl.vx v12, v28, a6
-; RV32V-NEXT:    vsrl.vx v4, v28, a5
-; RV32V-NEXT:    addi a2, s0, -256
-; RV32V-NEXT:    vand.vx v24, v24, a2
-; RV32V-NEXT:    vor.vv v24, v24, v20
-; RV32V-NEXT:    vsll.vx v0, v28, a6
-; RV32V-NEXT:    vand.vx v20, v4, a2
-; RV32V-NEXT:    vor.vv v12, v20, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v20, v8, a2
-; RV32V-NEXT:    vsll.vx v20, v20, a5
-; RV32V-NEXT:    vor.vv v20, v16, v20
-; RV32V-NEXT:    vand.vx v16, v28, a2
-; RV32V-NEXT:    vsll.vx v16, v16, a5
-; RV32V-NEXT:    vor.vv v12, v0, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vsrl.vi v4, v8, 24
-; RV32V-NEXT:    lui a4, 4080
-; RV32V-NEXT:    vand.vx v4, v4, a4
-; RV32V-NEXT:    vsrl.vi v0, v8, 8
-; RV32V-NEXT:    vmv4r.v v12, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vand.vv v0, v0, v8
-; RV32V-NEXT:    vor.vv v4, v0, v4
-; RV32V-NEXT:    vsrl.vi v0, v28, 24
-; RV32V-NEXT:    vand.vx v0, v0, a4
-; RV32V-NEXT:    vsrl.vi v16, v28, 8
-; RV32V-NEXT:    vand.vv v16, v16, v8
-; RV32V-NEXT:    vor.vv v16, v16, v0
-; RV32V-NEXT:    vor.vv v24, v4, v24
-; RV32V-NEXT:    vand.vx v4, v12, a4
-; RV32V-NEXT:    vsll.vi v4, v4, 24
-; RV32V-NEXT:    vand.vv v12, v12, v8
-; RV32V-NEXT:    vsll.vi v12, v12, 8
-; RV32V-NEXT:    vor.vv v12, v4, v12
-; RV32V-NEXT:    lui a7, 61681
-; RV32V-NEXT:    addi a7, a7, -241
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v4, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vor.vv v16, v16, v4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v16, v28, a4
-; RV32V-NEXT:    vsll.vi v16, v16, 24
-; RV32V-NEXT:    vand.vv v4, v28, v8
-; RV32V-NEXT:    vsll.vi v4, v4, 8
-; RV32V-NEXT:    vor.vv v16, v16, v4
-; RV32V-NEXT:    vsetvli t0, zero, e32, m4, ta, ma
-; RV32V-NEXT:    vmv.v.x v28, a7
-; RV32V-NEXT:    lui a7, 209715
-; RV32V-NEXT:    addi a7, a7, 819
-; RV32V-NEXT:    vsetvli t0, zero, e64, m4, ta, ma
-; RV32V-NEXT:    vor.vv v12, v20, v12
-; RV32V-NEXT:    vsetvli t0, zero, e32, m4, ta, ma
-; RV32V-NEXT:    vmv.v.x v0, a7
-; RV32V-NEXT:    lui a7, 349525
-; RV32V-NEXT:    addi a7, a7, 1365
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vsetvli t0, zero, e64, m4, ta, ma
-; RV32V-NEXT:    vor.vv v16, v8, v16
-; RV32V-NEXT:    vsetvli t0, zero, e32, m4, ta, ma
-; RV32V-NEXT:    vmv.v.x v4, a7
-; RV32V-NEXT:    vsetvli a7, zero, e64, m4, ta, ma
-; RV32V-NEXT:    vor.vv v12, v12, v24
-; RV32V-NEXT:    addi a7, sp, 264
-; RV32V-NEXT:    vlse64.v v20, (a7), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vor.vv v8, v16, v8
-; RV32V-NEXT:    vsrl.vi v16, v12, 4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v28, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v12, v28
-; RV32V-NEXT:    vand.vv v16, v16, v28
-; RV32V-NEXT:    vsll.vi v12, v12, 4
-; RV32V-NEXT:    vor.vv v12, v16, v12
-; RV32V-NEXT:    vsrl.vi v16, v8, 4
-; RV32V-NEXT:    vand.vv v8, v8, v28
-; RV32V-NEXT:    vand.vv v16, v16, v28
-; RV32V-NEXT:    vsll.vi v8, v8, 4
-; RV32V-NEXT:    vor.vv v8, v16, v8
-; RV32V-NEXT:    vsrl.vi v16, v12, 2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v0, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v12, v0
-; RV32V-NEXT:    vand.vv v16, v16, v0
-; RV32V-NEXT:    vsll.vi v12, v12, 2
-; RV32V-NEXT:    vor.vv v12, v16, v12
-; RV32V-NEXT:    vsrl.vi v16, v8, 2
-; RV32V-NEXT:    vand.vv v8, v8, v0
-; RV32V-NEXT:    vand.vv v16, v16, v0
-; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v16, v8
-; RV32V-NEXT:    vsrl.vi v16, v12, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v12, v4
-; RV32V-NEXT:    vand.vv v16, v16, v4
-; RV32V-NEXT:    vadd.vv v12, v12, v12
-; RV32V-NEXT:    vor.vv v24, v16, v12
-; RV32V-NEXT:    vsrl.vi v12, v8, 1
-; RV32V-NEXT:    vand.vv v8, v8, v4
-; RV32V-NEXT:    vand.vv v12, v12, v4
-; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v8, v12, v8
-; RV32V-NEXT:    addi a7, sp, 256
-; RV32V-NEXT:    vlse64.v v12, (a7), zero
-; RV32V-NEXT:    vand.vv v16, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a7, sp, 248
-; RV32V-NEXT:    addi t0, sp, 240
-; RV32V-NEXT:    addi a0, sp, 232
-; RV32V-NEXT:    vlse64.v v16, (a7), zero
-; RV32V-NEXT:    vlse64.v v20, (t0), zero
-; RV32V-NEXT:    vlse64.v v28, (a0), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a1, sp, 224
-; RV32V-NEXT:    addi a7, sp, 216
-; RV32V-NEXT:    addi t0, sp, 208
-; RV32V-NEXT:    addi a0, sp, 200
-; RV32V-NEXT:    vlse64.v v12, (a1), zero
-; RV32V-NEXT:    vlse64.v v16, (a7), zero
-; RV32V-NEXT:    vlse64.v v20, (t0), zero
-; RV32V-NEXT:    vlse64.v v28, (a0), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 192
-; RV32V-NEXT:    addi a1, sp, 184
-; RV32V-NEXT:    addi a7, sp, 176
-; RV32V-NEXT:    addi t0, sp, 168
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v16, (a1), zero
-; RV32V-NEXT:    vlse64.v v20, (a7), zero
-; RV32V-NEXT:    vlse64.v v28, (t0), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 160
-; RV32V-NEXT:    addi a1, sp, 152
-; RV32V-NEXT:    addi a7, sp, 144
-; RV32V-NEXT:    addi t0, sp, 136
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v16, (a1), zero
-; RV32V-NEXT:    vlse64.v v20, (a7), zero
-; RV32V-NEXT:    vlse64.v v28, (t0), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 128
-; RV32V-NEXT:    addi a1, sp, 120
-; RV32V-NEXT:    addi a7, sp, 112
-; RV32V-NEXT:    addi t0, sp, 104
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v16, (a1), zero
-; RV32V-NEXT:    vlse64.v v20, (a7), zero
-; RV32V-NEXT:    vlse64.v v28, (t0), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 96
-; RV32V-NEXT:    addi a1, sp, 88
-; RV32V-NEXT:    addi a7, sp, 80
-; RV32V-NEXT:    addi t0, sp, 72
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v16, (a1), zero
-; RV32V-NEXT:    vlse64.v v20, (a7), zero
-; RV32V-NEXT:    vlse64.v v28, (t0), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 64
-; RV32V-NEXT:    addi a1, sp, 56
-; RV32V-NEXT:    addi a7, sp, 48
-; RV32V-NEXT:    addi t0, sp, 40
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v16, (a1), zero
-; RV32V-NEXT:    vlse64.v v20, (a7), zero
-; RV32V-NEXT:    vlse64.v v28, (t0), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 32
-; RV32V-NEXT:    addi a1, sp, 24
-; RV32V-NEXT:    addi a7, sp, 16
-; RV32V-NEXT:    addi t0, sp, 8
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v16, (a1), zero
-; RV32V-NEXT:    vlse64.v v20, (a7), zero
-; RV32V-NEXT:    vlse64.v v28, (t0), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vand.vx v12, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vand.vx v12, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vand.vx v28, v8, a0
-; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vand.vx v4, v8, a0
-; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vand.vx v0, v8, a0
-; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vand.vx v12, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vand.vx v12, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, a3
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, t1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, t3
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    lui a0, 4
-; RV32V-NEXT:    vand.vx v12, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, t5
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s3
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s5
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s6
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s7
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s9
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s11
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, ra
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, t4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, t2
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vi v12, v8, 2
-; RV32V-NEXT:    vand.vi v16, v8, 1
-; RV32V-NEXT:    vand.vi v20, v8, 4
-; RV32V-NEXT:    vand.vi v8, v8, 8
-; RV32V-NEXT:    vmul.vv v12, v24, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v12, v24, v16
-; RV32V-NEXT:    vmul.vv v16, v24, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v20, v24, v8
-; RV32V-NEXT:    vmul.vv v28, v24, v28
-; RV32V-NEXT:    vmul.vv v4, v24, v4
-; RV32V-NEXT:    vmul.vv v0, v24, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v24, v8
-; RV32V-NEXT:    vxor.vi v8, v12, 0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v28
-; RV32V-NEXT:    vxor.vv v8, v8, v4
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vsrl.vx v12, v8, a6
-; RV32V-NEXT:    vsrl.vx v16, v8, a5
-; RV32V-NEXT:    vsrl.vi v20, v8, 24
-; RV32V-NEXT:    vand.vx v16, v16, a2
-; RV32V-NEXT:    vor.vv v12, v16, v12
-; RV32V-NEXT:    vsrl.vi v16, v8, 8
-; RV32V-NEXT:    vand.vx v20, v20, a4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v24, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vand.vv v16, v16, v24
-; RV32V-NEXT:    vor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v8, a4
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    vsll.vi v24, v24, 8
-; RV32V-NEXT:    vsll.vi v20, v20, 24
-; RV32V-NEXT:    vor.vv v20, v20, v24
-; RV32V-NEXT:    vsll.vx v24, v8, a6
-; RV32V-NEXT:    vand.vx v8, v8, a2
-; RV32V-NEXT:    vsll.vx v8, v8, a5
-; RV32V-NEXT:    vor.vv v8, v24, v8
-; RV32V-NEXT:    vor.vv v12, v16, v12
-; RV32V-NEXT:    vor.vv v8, v8, v20
-; RV32V-NEXT:    vor.vv v8, v8, v12
-; RV32V-NEXT:    vsrl.vi v12, v8, 4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v16
-; RV32V-NEXT:    vand.vv v12, v12, v16
-; RV32V-NEXT:    vsll.vi v8, v8, 4
-; RV32V-NEXT:    vor.vv v8, v12, v8
-; RV32V-NEXT:    vsrl.vi v12, v8, 2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v16
-; RV32V-NEXT:    vand.vv v12, v12, v16
-; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v12, v8
-; RV32V-NEXT:    vsrl.vi v12, v8, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v16
-; RV32V-NEXT:    vand.vv v12, v12, v16
-; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v8, v12, v8
-; RV32V-NEXT:    vsrl.vi v8, v8, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 8
-; RV32V-NEXT:    add sp, sp, a0
-; RV32V-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    addi sp, sp, 352
-; RV32V-NEXT:    ret
-;
-; RV64V-LABEL: clmulh_nxv4i64_vv:
-; RV64V:       # %bb.0:
-; RV64V-NEXT:    addi sp, sp, -384
-; RV64V-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s1, 360(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s2, 352(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s3, 344(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s4, 336(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s5, 328(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s6, 320(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s7, 312(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s8, 304(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s9, 296(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s10, 288(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s11, 280(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    sub sp, sp, a0
-; RV64V-NEXT:    lui a5, 16
-; RV64V-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64V-NEXT:    vsrl.vi v24, v8, 24
-; RV64V-NEXT:    vsrl.vi v16, v8, 8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    li s3, 255
-; RV64V-NEXT:    lui a0, 61681
-; RV64V-NEXT:    lui a1, 209715
-; RV64V-NEXT:    lui a2, 349525
-; RV64V-NEXT:    vsrl.vi v28, v12, 24
-; RV64V-NEXT:    li a3, 56
-; RV64V-NEXT:    vsrl.vx v16, v8, a3
-; RV64V-NEXT:    li a4, 40
-; RV64V-NEXT:    vsrl.vx v20, v8, a4
-; RV64V-NEXT:    addi t3, a5, -256
-; RV64V-NEXT:    vsrl.vx v4, v12, a3
-; RV64V-NEXT:    vand.vx v20, v20, t3
-; RV64V-NEXT:    vor.vv v20, v20, v16
-; RV64V-NEXT:    vsrl.vx v16, v12, a4
-; RV64V-NEXT:    li t4, 40
-; RV64V-NEXT:    vand.vx v16, v16, t3
-; RV64V-NEXT:    vor.vv v0, v16, v4
-; RV64V-NEXT:    vsrl.vi v16, v12, 8
-; RV64V-NEXT:    li a3, 16
-; RV64V-NEXT:    li a6, 32
-; RV64V-NEXT:    li a5, 64
-; RV64V-NEXT:    li a4, 128
-; RV64V-NEXT:    li t0, 256
-; RV64V-NEXT:    li a7, 512
-; RV64V-NEXT:    li t2, 1
-; RV64V-NEXT:    lui t1, 4080
-; RV64V-NEXT:    vand.vx v4, v24, t1
-; RV64V-NEXT:    slli s3, s3, 24
-; RV64V-NEXT:    vand.vx v24, v28, t1
-; RV64V-NEXT:    vand.vx v16, v16, s3
-; RV64V-NEXT:    vor.vv v16, v16, v24
-; RV64V-NEXT:    vand.vx v24, v12, t1
-; RV64V-NEXT:    lui t5, 4080
-; RV64V-NEXT:    vsll.vi v28, v24, 24
-; RV64V-NEXT:    vor.vv v24, v16, v0
-; RV64V-NEXT:    vand.vx v16, v12, s3
-; RV64V-NEXT:    vsll.vi v16, v16, 8
-; RV64V-NEXT:    vor.vv v16, v28, v16
-; RV64V-NEXT:    li t1, 56
-; RV64V-NEXT:    vsll.vx v28, v12, t1
-; RV64V-NEXT:    vand.vx v12, v12, t3
-; RV64V-NEXT:    mv s0, t3
-; RV64V-NEXT:    sd t3, 224(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vsll.vx v12, v12, t4
-; RV64V-NEXT:    vor.vv v12, v28, v12
-; RV64V-NEXT:    vand.vx v0, v8, t5
-; RV64V-NEXT:    vor.vv v12, v12, v16
-; RV64V-NEXT:    vsll.vx v28, v8, t1
-; RV64V-NEXT:    addi t3, a0, -241
-; RV64V-NEXT:    addi t5, a1, 819
-; RV64V-NEXT:    addi t6, a2, 1365
-; RV64V-NEXT:    slli a0, t2, 11
-; RV64V-NEXT:    sd a0, 216(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 31
-; RV64V-NEXT:    sd a0, 208(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 32
-; RV64V-NEXT:    sd a0, 200(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 33
-; RV64V-NEXT:    sd a0, 192(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 34
-; RV64V-NEXT:    sd a0, 184(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 35
-; RV64V-NEXT:    sd a0, 176(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 36
-; RV64V-NEXT:    sd a0, 168(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 37
-; RV64V-NEXT:    sd a0, 160(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 38
-; RV64V-NEXT:    sd a0, 152(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 39
-; RV64V-NEXT:    sd a0, 144(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 40
-; RV64V-NEXT:    sd a0, 136(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t3, 32
-; RV64V-NEXT:    add t3, t3, a0
-; RV64V-NEXT:    slli a0, t5, 32
-; RV64V-NEXT:    add t5, t5, a0
-; RV64V-NEXT:    slli a0, t6, 32
-; RV64V-NEXT:    add a0, t6, a0
-; RV64V-NEXT:    slli a1, t2, 41
-; RV64V-NEXT:    sd a1, 128(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vor.vv v12, v12, v24
-; RV64V-NEXT:    vsrl.vi v16, v12, 4
-; RV64V-NEXT:    vand.vx v12, v12, t3
-; RV64V-NEXT:    vand.vx v16, v16, t3
-; RV64V-NEXT:    vsll.vi v12, v12, 4
-; RV64V-NEXT:    vor.vv v12, v16, v12
-; RV64V-NEXT:    vsrl.vi v16, v12, 2
-; RV64V-NEXT:    vand.vx v12, v12, t5
-; RV64V-NEXT:    vand.vx v16, v16, t5
-; RV64V-NEXT:    vsll.vi v12, v12, 2
-; RV64V-NEXT:    vor.vv v12, v16, v12
-; RV64V-NEXT:    vsrl.vi v16, v12, 1
-; RV64V-NEXT:    vand.vx v12, v12, a0
-; RV64V-NEXT:    vand.vx v16, v16, a0
-; RV64V-NEXT:    vadd.vv v12, v12, v12
-; RV64V-NEXT:    vor.vv v24, v16, v12
-; RV64V-NEXT:    vand.vx v12, v24, a3
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 4
-; RV64V-NEXT:    mv a2, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 3
-; RV64V-NEXT:    add a1, a1, a2
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 272
-; RV64V-NEXT:    vs4r.v v12, (a1) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    slli a1, t2, 42
-; RV64V-NEXT:    sd a1, 120(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    mv a2, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 3
-; RV64V-NEXT:    add a1, a1, a2
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 272
-; RV64V-NEXT:    vl4r.v v12, (a1) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    sd s3, 232(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v12, s3
-; RV64V-NEXT:    vor.vv v12, v12, v4
-; RV64V-NEXT:    vand.vx v16, v24, a6
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 4
-; RV64V-NEXT:    mv a2, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a1, a1, a2
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 272
-; RV64V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    slli a1, t2, 43
-; RV64V-NEXT:    sd a1, 112(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vsll.vi v16, v0, 24
-; RV64V-NEXT:    vor.vv v12, v12, v20
-; RV64V-NEXT:    vand.vx v20, v8, s3
-; RV64V-NEXT:    vsll.vi v20, v20, 8
-; RV64V-NEXT:    vor.vv v16, v16, v20
-; RV64V-NEXT:    vand.vx v0, v24, a5
-; RV64V-NEXT:    slli a1, t2, 44
-; RV64V-NEXT:    sd a1, 104(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v8, s0
-; RV64V-NEXT:    vsll.vx v8, v8, t4
-; RV64V-NEXT:    vor.vv v8, v28, v8
-; RV64V-NEXT:    vand.vx v20, v24, a4
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    mv a2, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 3
-; RV64V-NEXT:    add a1, a1, a2
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 272
-; RV64V-NEXT:    vs4r.v v20, (a1) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    slli a1, t2, 45
-; RV64V-NEXT:    sd a1, 96(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vor.vv v8, v8, v16
-; RV64V-NEXT:    vand.vx v16, v24, t0
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    mv a2, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 3
-; RV64V-NEXT:    add a1, a1, a2
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 272
-; RV64V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    slli a1, t2, 46
-; RV64V-NEXT:    sd a1, 88(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vor.vv v8, v8, v12
-; RV64V-NEXT:    vsrl.vi v12, v8, 4
-; RV64V-NEXT:    sd t3, 240(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v8, t3
-; RV64V-NEXT:    vand.vx v12, v12, t3
-; RV64V-NEXT:    vsll.vi v8, v8, 4
-; RV64V-NEXT:    vor.vv v8, v12, v8
-; RV64V-NEXT:    vsrl.vi v12, v8, 2
-; RV64V-NEXT:    sd t5, 248(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v8, t5
-; RV64V-NEXT:    vand.vx v12, v12, t5
-; RV64V-NEXT:    vsll.vi v8, v8, 2
-; RV64V-NEXT:    vor.vv v8, v12, v8
-; RV64V-NEXT:    vsrl.vi v12, v8, 1
-; RV64V-NEXT:    sd a0, 256(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v8, a0
-; RV64V-NEXT:    vand.vx v12, v12, a0
-; RV64V-NEXT:    vadd.vv v8, v8, v8
-; RV64V-NEXT:    vor.vv v8, v12, v8
-; RV64V-NEXT:    vand.vx v12, v24, a7
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 47
-; RV64V-NEXT:    sd a0, 80(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 48
-; RV64V-NEXT:    sd a0, 72(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 49
-; RV64V-NEXT:    sd a0, 64(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 50
-; RV64V-NEXT:    sd a0, 56(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 51
-; RV64V-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 52
-; RV64V-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 53
-; RV64V-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 54
-; RV64V-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli ra, t2, 55
-; RV64V-NEXT:    slli s11, t2, 56
-; RV64V-NEXT:    slli s10, t2, 57
-; RV64V-NEXT:    slli s9, t2, 58
-; RV64V-NEXT:    slli s8, t2, 59
-; RV64V-NEXT:    slli s6, t2, 60
-; RV64V-NEXT:    slli s7, t2, 61
-; RV64V-NEXT:    slli s5, t2, 62
-; RV64V-NEXT:    li a0, -1
-; RV64V-NEXT:    slli s4, a0, 63
-; RV64V-NEXT:    li a0, 1024
-; RV64V-NEXT:    lui a1, 1
-; RV64V-NEXT:    lui a2, 2
-; RV64V-NEXT:    lui a3, 4
-; RV64V-NEXT:    lui a4, 8
-; RV64V-NEXT:    lui a5, 32
-; RV64V-NEXT:    lui a6, 64
-; RV64V-NEXT:    lui a7, 128
-; RV64V-NEXT:    lui t0, 256
-; RV64V-NEXT:    lui t1, 512
-; RV64V-NEXT:    lui t2, 1024
-; RV64V-NEXT:    lui t3, 2048
-; RV64V-NEXT:    lui t4, 4096
-; RV64V-NEXT:    lui t5, 8192
-; RV64V-NEXT:    lui t6, 16384
-; RV64V-NEXT:    lui s0, 32768
-; RV64V-NEXT:    lui s1, 65536
-; RV64V-NEXT:    lui s2, 131072
-; RV64V-NEXT:    lui s3, 262144
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    sd s8, 8(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv s8, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add s8, s8, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, s8
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 216(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv s8, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, s8
-; RV64V-NEXT:    ld s8, 8(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, a1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, a2
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, a3
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, a4
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    lui a0, 16
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, a5
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, a6
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, a7
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 8
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, t0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, t1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, t2
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, t3
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, t4
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, t5
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, t6
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, s0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, s1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, s2
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, s3
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 208(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 200(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 192(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 184(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 176(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 168(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 160(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 152(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 144(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 136(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 128(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 120(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 112(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 104(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 96(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 88(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 80(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 72(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 64(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 56(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 7
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v12, v24, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, ra
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, s11
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, s10
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, s9
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, s8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v12, v24, s6
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vi v12, v24, 2
-; RV64V-NEXT:    vand.vi v16, v24, 1
-; RV64V-NEXT:    vand.vi v20, v24, 4
-; RV64V-NEXT:    vand.vi v28, v24, 8
-; RV64V-NEXT:    vand.vx v4, v24, s7
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v4, v24, s5
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vand.vx v24, v24, s4
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v24, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vmul.vv v12, v8, v16
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vmul.vv v16, v8, v20
-; RV64V-NEXT:    vmul.vv v20, v8, v28
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v24, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v24, v8, v24
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v28, v8, v12
-; RV64V-NEXT:    vmul.vv v4, v8, v0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v0, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v0, v8, v0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 8
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    addi a0, sp, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 8
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 7
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v12, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v8, v8, v12
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v8, v12
-; RV64V-NEXT:    vxor.vv v12, v12, v16
-; RV64V-NEXT:    vxor.vv v12, v12, v20
-; RV64V-NEXT:    vxor.vv v12, v12, v24
-; RV64V-NEXT:    vxor.vv v12, v12, v28
-; RV64V-NEXT:    vxor.vv v12, v12, v4
-; RV64V-NEXT:    vxor.vv v12, v12, v0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v12, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v20, v16, v8
-; RV64V-NEXT:    addi a0, sp, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v20, v20, v8
-; RV64V-NEXT:    li a2, 56
-; RV64V-NEXT:    vsll.vx v12, v12, a2
-; RV64V-NEXT:    ld a1, 224(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v16, a1
-; RV64V-NEXT:    li a0, 40
-; RV64V-NEXT:    vsll.vx v16, v16, a0
-; RV64V-NEXT:    vor.vv v12, v12, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 6
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v16, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v20, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 8
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 5
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v20, v16, v20
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v20, v20, v24
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v20, v20, v24
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 6
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v20, v20, v24
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v20, v20, v24
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v20, v20, v24
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v20, v20, v24
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v20, v20, v24
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v24, v20, v24
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v28, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v24, v24, v28
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v28, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v24, v24, v28
-; RV64V-NEXT:    vsrl.vi v28, v16, 8
-; RV64V-NEXT:    ld a4, 232(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v28, v28, a4
-; RV64V-NEXT:    vsrl.vi v20, v20, 24
-; RV64V-NEXT:    lui a3, 4080
-; RV64V-NEXT:    vand.vx v20, v20, a3
-; RV64V-NEXT:    vor.vv v20, v28, v20
-; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 5
-; RV64V-NEXT:    mv a6, a5
-; RV64V-NEXT:    slli a5, a5, 3
-; RV64V-NEXT:    add a5, a5, a6
-; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 272
-; RV64V-NEXT:    vl4r.v v28, (a5) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v24, v24, v28
-; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 2
-; RV64V-NEXT:    mv a6, a5
-; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 4
-; RV64V-NEXT:    add a5, a5, a6
-; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 272
-; RV64V-NEXT:    vl4r.v v28, (a5) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v24, v24, v28
-; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 3
-; RV64V-NEXT:    mv a6, a5
-; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 4
-; RV64V-NEXT:    add a5, a5, a6
-; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 272
-; RV64V-NEXT:    vl4r.v v28, (a5) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v24, v24, v28
-; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 2
-; RV64V-NEXT:    mv a6, a5
-; RV64V-NEXT:    slli a5, a5, 2
-; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 4
-; RV64V-NEXT:    add a5, a5, a6
-; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 272
-; RV64V-NEXT:    vl4r.v v28, (a5) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v24, v24, v28
-; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 4
-; RV64V-NEXT:    mv a6, a5
-; RV64V-NEXT:    slli a5, a5, 4
-; RV64V-NEXT:    add a5, a5, a6
-; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 272
-; RV64V-NEXT:    vl4r.v v28, (a5) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v24, v24, v28
-; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 2
-; RV64V-NEXT:    mv a6, a5
-; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 5
-; RV64V-NEXT:    add a5, a5, a6
-; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 272
-; RV64V-NEXT:    vl4r.v v28, (a5) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v28, v24, v28
-; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 3
-; RV64V-NEXT:    mv a6, a5
-; RV64V-NEXT:    slli a5, a5, 5
-; RV64V-NEXT:    add a5, a5, a6
-; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 272
-; RV64V-NEXT:    vl4r.v v4, (a5) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v28, v28, v4
-; RV64V-NEXT:    vand.vx v16, v16, a3
-; RV64V-NEXT:    vsll.vi v16, v16, 24
-; RV64V-NEXT:    vand.vx v4, v24, a4
-; RV64V-NEXT:    vsll.vi v4, v4, 8
-; RV64V-NEXT:    vor.vv v16, v16, v4
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v4, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v28, v28, v4
-; RV64V-NEXT:    vor.vv v12, v12, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 272
-; RV64V-NEXT:    vl4r.v v16, (a3) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v28, v16
-; RV64V-NEXT:    vsrl.vx v24, v24, a0
-; RV64V-NEXT:    vand.vx v24, v24, a1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v28
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v28
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v28
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 272
-; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v16, v8
-; RV64V-NEXT:    vsrl.vx v8, v8, a2
-; RV64V-NEXT:    vor.vv v8, v24, v8
-; RV64V-NEXT:    vor.vv v8, v20, v8
-; RV64V-NEXT:    vor.vv v8, v12, v8
-; RV64V-NEXT:    vsrl.vi v12, v8, 4
-; RV64V-NEXT:    ld a0, 240(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v8, a0
-; RV64V-NEXT:    vand.vx v12, v12, a0
-; RV64V-NEXT:    vsll.vi v8, v8, 4
-; RV64V-NEXT:    vor.vv v8, v12, v8
-; RV64V-NEXT:    vsrl.vi v12, v8, 2
-; RV64V-NEXT:    ld a0, 248(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v8, a0
-; RV64V-NEXT:    vand.vx v12, v12, a0
-; RV64V-NEXT:    vsll.vi v8, v8, 2
-; RV64V-NEXT:    vor.vv v8, v12, v8
-; RV64V-NEXT:    vsrl.vi v12, v8, 1
-; RV64V-NEXT:    ld a0, 256(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v8, a0
-; RV64V-NEXT:    vand.vx v12, v12, a0
-; RV64V-NEXT:    vadd.vv v8, v8, v8
-; RV64V-NEXT:    vor.vv v8, v12, v8
-; RV64V-NEXT:    vsrl.vi v8, v8, 1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add sp, sp, a0
-; RV64V-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s1, 360(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s2, 352(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s3, 344(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s4, 336(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s5, 328(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s6, 320(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s7, 312(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s8, 304(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s9, 296(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s10, 288(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s11, 280(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    addi sp, sp, 384
-; RV64V-NEXT:    ret
-;
-; RV32ZVBC-LABEL: clmulh_nxv4i64_vv:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v12
-; RV32ZVBC-NEXT:    ret
-;
-; RV64ZVBC-LABEL: clmulh_nxv4i64_vv:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vclmulh.vv v8, v8, v12
-; RV64ZVBC-NEXT:    ret
-  %va.ext = zext <vscale x 4 x i64> %va to <vscale x 4 x i128>
-  %vb.ext = zext <vscale x 4 x i64> %vb to <vscale x 4 x i128>
-  %clmul = call <vscale x 4 x i128> @llvm.clmul.nxv4i128(<vscale x 4 x i128> %va.ext, <vscale x 4 x i128> %vb.ext)
-  %res.ext = lshr <vscale x 4 x i128> %clmul, splat(i128 64)
-  %res = trunc <vscale x 4 x i128> %res.ext to <vscale x 4 x i64>
-  ret <vscale x 4 x i64> %res
-}
-
-define <vscale x 4 x i64> @clmulh_nxv4i64_vx(<vscale x 4 x i64> %va, i64 %b) nounwind {
-; RV32V-LABEL: clmulh_nxv4i64_vx:
-; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -368
-; RV32V-NEXT:    sw ra, 364(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s0, 360(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 356(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 352(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 348(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 344(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 340(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 336(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 332(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s8, 328(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s9, 324(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s10, 320(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s11, 316(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a2, a2, 8
-; RV32V-NEXT:    sub sp, sp, a2
-; RV32V-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
-; RV32V-NEXT:    vmv4r.v v0, v8
-; RV32V-NEXT:    sw a0, 16(sp)
-; RV32V-NEXT:    sw a1, 20(sp)
-; RV32V-NEXT:    addi s10, sp, 16
-; RV32V-NEXT:    lui s11, 1044480
-; RV32V-NEXT:    lui s0, 524288
-; RV32V-NEXT:    li a0, 1
-; RV32V-NEXT:    li ra, 2
-; RV32V-NEXT:    li t5, 4
-; RV32V-NEXT:    li t3, 8
-; RV32V-NEXT:    li t6, 16
-; RV32V-NEXT:    li t4, 32
-; RV32V-NEXT:    li t2, 64
-; RV32V-NEXT:    li t1, 128
-; RV32V-NEXT:    li t0, 256
-; RV32V-NEXT:    li a7, 512
-; RV32V-NEXT:    li a3, 1024
-; RV32V-NEXT:    lui a2, 1
-; RV32V-NEXT:    lui a4, 2
-; RV32V-NEXT:    lui a1, 4
-; RV32V-NEXT:    lui a5, 8
-; RV32V-NEXT:    lui s1, 16
-; RV32V-NEXT:    lui a6, 32
-; RV32V-NEXT:    lui s2, 64
-; RV32V-NEXT:    lui s3, 128
-; RV32V-NEXT:    lui s4, 256
-; RV32V-NEXT:    lui s5, 512
-; RV32V-NEXT:    lui s6, 1024
-; RV32V-NEXT:    lui s7, 2048
-; RV32V-NEXT:    lui s8, 4096
-; RV32V-NEXT:    lui s9, 8192
-; RV32V-NEXT:    vlse64.v v4, (s10), zero
-; RV32V-NEXT:    lui s10, 16384
-; RV32V-NEXT:    sw s11, 288(sp)
-; RV32V-NEXT:    lui s11, 32768
-; RV32V-NEXT:    sw zero, 292(sp)
-; RV32V-NEXT:    sw s0, 280(sp)
-; RV32V-NEXT:    sw zero, 284(sp)
-; RV32V-NEXT:    sw zero, 272(sp)
-; RV32V-NEXT:    sw a0, 276(sp)
-; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw ra, 268(sp)
-; RV32V-NEXT:    lui ra, 65536
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw t5, 260(sp)
-; RV32V-NEXT:    lui t5, 131072
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw t3, 252(sp)
-; RV32V-NEXT:    lui t3, 262144
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw t6, 244(sp)
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw t4, 236(sp)
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw t2, 228(sp)
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw t1, 220(sp)
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw t0, 212(sp)
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw a7, 204(sp)
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw a3, 196(sp)
-; RV32V-NEXT:    slli a3, a0, 11
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw a3, 188(sp)
-; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw a2, 180(sp)
-; RV32V-NEXT:    lui t1, 1
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw a4, 172(sp)
-; RV32V-NEXT:    lui t4, 2
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw a1, 164(sp)
-; RV32V-NEXT:    lui t2, 4
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw a5, 156(sp)
-; RV32V-NEXT:    lui t6, 8
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw s1, 148(sp)
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw a6, 140(sp)
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw s2, 132(sp)
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw s3, 124(sp)
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw s4, 116(sp)
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw s5, 108(sp)
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw s6, 100(sp)
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw s7, 92(sp)
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw s8, 84(sp)
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw s9, 76(sp)
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw s10, 68(sp)
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw s11, 60(sp)
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw ra, 52(sp)
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw t5, 44(sp)
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw t3, 36(sp)
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    sw s0, 28(sp)
-; RV32V-NEXT:    addi a1, sp, 288
-; RV32V-NEXT:    vlse64.v v12, (a1), zero
-; RV32V-NEXT:    li a6, 56
-; RV32V-NEXT:    vsrl.vx v16, v8, a6
-; RV32V-NEXT:    li a5, 40
-; RV32V-NEXT:    vsrl.vx v20, v8, a5
-; RV32V-NEXT:    vsll.vx v24, v8, a6
-; RV32V-NEXT:    addi a2, s1, -256
-; RV32V-NEXT:    vand.vx v20, v20, a2
-; RV32V-NEXT:    vand.vx v28, v8, a2
-; RV32V-NEXT:    vor.vv v8, v20, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vsll.vx v16, v28, a5
-; RV32V-NEXT:    vor.vv v8, v24, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vmv.v.v v20, v4
-; RV32V-NEXT:    vsrl.vx v24, v4, a6
-; RV32V-NEXT:    vsrl.vx v28, v4, a5
-; RV32V-NEXT:    vsll.vx v4, v4, a6
-; RV32V-NEXT:    vand.vx v28, v28, a2
-; RV32V-NEXT:    vor.vv v28, v28, v24
-; RV32V-NEXT:    vand.vx v24, v20, a2
-; RV32V-NEXT:    vsll.vx v24, v24, a5
-; RV32V-NEXT:    vor.vv v8, v4, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vmv4r.v v8, v0
-; RV32V-NEXT:    vsrl.vi v4, v0, 24
-; RV32V-NEXT:    lui a4, 4080
-; RV32V-NEXT:    vand.vx v4, v4, a4
-; RV32V-NEXT:    vsrl.vi v0, v0, 8
-; RV32V-NEXT:    vand.vv v0, v0, v12
-; RV32V-NEXT:    vor.vv v4, v0, v4
-; RV32V-NEXT:    vsrl.vi v0, v20, 24
-; RV32V-NEXT:    vand.vx v0, v0, a4
-; RV32V-NEXT:    vsrl.vi v16, v20, 8
-; RV32V-NEXT:    vand.vv v16, v16, v12
-; RV32V-NEXT:    vor.vv v16, v16, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v24, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vor.vv v24, v4, v24
-; RV32V-NEXT:    vand.vx v4, v8, a4
-; RV32V-NEXT:    vsll.vi v4, v4, 24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v8, v12
-; RV32V-NEXT:    vsll.vi v8, v8, 8
-; RV32V-NEXT:    vor.vv v4, v4, v8
-; RV32V-NEXT:    lui a7, 61681
-; RV32V-NEXT:    addi a7, a7, -241
-; RV32V-NEXT:    vor.vv v8, v16, v28
-; RV32V-NEXT:    vand.vx v16, v20, a4
-; RV32V-NEXT:    vsll.vi v16, v16, 24
-; RV32V-NEXT:    vand.vv v12, v20, v12
-; RV32V-NEXT:    vsll.vi v12, v12, 8
-; RV32V-NEXT:    vor.vv v12, v16, v12
-; RV32V-NEXT:    vsetvli t0, zero, e32, m4, ta, ma
-; RV32V-NEXT:    vmv.v.x v0, a7
-; RV32V-NEXT:    lui a7, 209715
-; RV32V-NEXT:    addi a7, a7, 819
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vsetvli t0, zero, e64, m4, ta, ma
-; RV32V-NEXT:    vor.vv v28, v16, v4
-; RV32V-NEXT:    vsetvli t0, zero, e32, m4, ta, ma
-; RV32V-NEXT:    vmv.v.x v4, a7
-; RV32V-NEXT:    lui a7, 349525
-; RV32V-NEXT:    addi a7, a7, 1365
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vsetvli t0, zero, e64, m4, ta, ma
-; RV32V-NEXT:    vor.vv v16, v16, v12
-; RV32V-NEXT:    vsetvli t0, zero, e32, m4, ta, ma
-; RV32V-NEXT:    vmv.v.x v12, a7
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vsetvli a7, zero, e64, m4, ta, ma
-; RV32V-NEXT:    vor.vv v12, v28, v24
-; RV32V-NEXT:    addi a7, sp, 280
-; RV32V-NEXT:    vlse64.v v20, (a7), zero
-; RV32V-NEXT:    vor.vv v8, v16, v8
-; RV32V-NEXT:    vsrl.vi v16, v12, 4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v0, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v12, v0
-; RV32V-NEXT:    vand.vv v16, v16, v0
-; RV32V-NEXT:    vsll.vi v12, v12, 4
-; RV32V-NEXT:    vor.vv v12, v16, v12
-; RV32V-NEXT:    vsrl.vi v16, v8, 4
-; RV32V-NEXT:    vand.vv v8, v8, v0
-; RV32V-NEXT:    vand.vv v16, v16, v0
-; RV32V-NEXT:    vsll.vi v8, v8, 4
-; RV32V-NEXT:    vor.vv v8, v16, v8
-; RV32V-NEXT:    vsrl.vi v16, v12, 2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v12, v4
-; RV32V-NEXT:    vand.vv v16, v16, v4
-; RV32V-NEXT:    vsll.vi v12, v12, 2
-; RV32V-NEXT:    vor.vv v12, v16, v12
-; RV32V-NEXT:    vsrl.vi v16, v8, 2
-; RV32V-NEXT:    vand.vv v8, v8, v4
-; RV32V-NEXT:    vand.vv v16, v16, v4
-; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v16, v8
-; RV32V-NEXT:    vsrl.vi v16, v12, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vand.vv v12, v12, v28
-; RV32V-NEXT:    vand.vv v16, v16, v28
-; RV32V-NEXT:    vadd.vv v12, v12, v12
-; RV32V-NEXT:    vor.vv v24, v16, v12
-; RV32V-NEXT:    vsrl.vi v12, v8, 1
-; RV32V-NEXT:    vand.vv v8, v8, v28
-; RV32V-NEXT:    vand.vv v12, v12, v28
-; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v8, v12, v8
-; RV32V-NEXT:    addi a7, sp, 272
-; RV32V-NEXT:    vlse64.v v12, (a7), zero
-; RV32V-NEXT:    vand.vv v16, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a7, sp, 264
-; RV32V-NEXT:    addi t0, sp, 256
-; RV32V-NEXT:    addi a0, sp, 248
-; RV32V-NEXT:    vlse64.v v16, (a7), zero
-; RV32V-NEXT:    vlse64.v v20, (t0), zero
-; RV32V-NEXT:    vlse64.v v28, (a0), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a1, sp, 240
-; RV32V-NEXT:    addi a7, sp, 232
-; RV32V-NEXT:    addi t0, sp, 224
-; RV32V-NEXT:    addi a0, sp, 216
-; RV32V-NEXT:    vlse64.v v12, (a1), zero
-; RV32V-NEXT:    vlse64.v v16, (a7), zero
-; RV32V-NEXT:    vlse64.v v20, (t0), zero
-; RV32V-NEXT:    vlse64.v v28, (a0), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 208
-; RV32V-NEXT:    addi a1, sp, 200
-; RV32V-NEXT:    addi a7, sp, 192
-; RV32V-NEXT:    addi t0, sp, 184
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v16, (a1), zero
-; RV32V-NEXT:    vlse64.v v20, (a7), zero
-; RV32V-NEXT:    vlse64.v v28, (t0), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 176
-; RV32V-NEXT:    addi a1, sp, 168
-; RV32V-NEXT:    addi a7, sp, 160
-; RV32V-NEXT:    addi t0, sp, 152
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v16, (a1), zero
-; RV32V-NEXT:    vlse64.v v20, (a7), zero
-; RV32V-NEXT:    vlse64.v v28, (t0), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 144
-; RV32V-NEXT:    addi a1, sp, 136
-; RV32V-NEXT:    addi a7, sp, 128
-; RV32V-NEXT:    addi t0, sp, 120
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v16, (a1), zero
-; RV32V-NEXT:    vlse64.v v20, (a7), zero
-; RV32V-NEXT:    vlse64.v v28, (t0), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 112
-; RV32V-NEXT:    addi a1, sp, 104
-; RV32V-NEXT:    addi a7, sp, 96
-; RV32V-NEXT:    addi t0, sp, 88
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v16, (a1), zero
-; RV32V-NEXT:    vlse64.v v20, (a7), zero
-; RV32V-NEXT:    vlse64.v v28, (t0), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 80
-; RV32V-NEXT:    addi a1, sp, 72
-; RV32V-NEXT:    addi a7, sp, 64
-; RV32V-NEXT:    addi t0, sp, 56
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v16, (a1), zero
-; RV32V-NEXT:    vlse64.v v20, (a7), zero
-; RV32V-NEXT:    vlse64.v v28, (t0), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 48
-; RV32V-NEXT:    addi a1, sp, 40
-; RV32V-NEXT:    addi a7, sp, 32
-; RV32V-NEXT:    addi t0, sp, 24
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vlse64.v v16, (a1), zero
-; RV32V-NEXT:    vlse64.v v20, (a7), zero
-; RV32V-NEXT:    vlse64.v v28, (t0), zero
-; RV32V-NEXT:    vand.vv v12, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vv v12, v8, v28
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vand.vx v12, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vand.vx v12, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vand.vx v28, v8, a0
-; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vand.vx v4, v8, a0
-; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vand.vx v0, v8, a0
-; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vand.vx v12, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vand.vx v12, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, a3
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, t1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, t4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, t2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, t6
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    lui a0, 32
-; RV32V-NEXT:    vand.vx v12, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s3
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s5
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s6
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s7
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s9
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, s11
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, ra
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, t5
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vx v12, v8, t3
-; RV32V-NEXT:    addi a0, sp, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vand.vi v12, v8, 2
-; RV32V-NEXT:    vand.vi v16, v8, 1
-; RV32V-NEXT:    vand.vi v20, v8, 4
-; RV32V-NEXT:    vand.vi v8, v8, 8
-; RV32V-NEXT:    vmul.vv v12, v24, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v12, v24, v16
-; RV32V-NEXT:    vmul.vv v16, v24, v20
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v20, v24, v8
-; RV32V-NEXT:    vmul.vv v28, v24, v28
-; RV32V-NEXT:    vmul.vv v4, v24, v4
-; RV32V-NEXT:    vmul.vv v0, v24, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v24, v8
-; RV32V-NEXT:    vxor.vi v8, v12, 0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v28
-; RV32V-NEXT:    vxor.vv v8, v8, v4
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vsrl.vx v12, v8, a6
-; RV32V-NEXT:    vsrl.vx v16, v8, a5
-; RV32V-NEXT:    vsrl.vi v20, v8, 24
-; RV32V-NEXT:    vand.vx v16, v16, a2
-; RV32V-NEXT:    vor.vv v12, v16, v12
-; RV32V-NEXT:    vsrl.vi v16, v8, 8
-; RV32V-NEXT:    vand.vx v20, v20, a4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v24, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vand.vv v16, v16, v24
-; RV32V-NEXT:    vor.vv v16, v16, v20
-; RV32V-NEXT:    vand.vx v20, v8, a4
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    vsll.vi v24, v24, 8
-; RV32V-NEXT:    vsll.vi v20, v20, 24
-; RV32V-NEXT:    vor.vv v20, v20, v24
-; RV32V-NEXT:    vsll.vx v24, v8, a6
-; RV32V-NEXT:    vand.vx v8, v8, a2
-; RV32V-NEXT:    vsll.vx v8, v8, a5
-; RV32V-NEXT:    vor.vv v8, v24, v8
-; RV32V-NEXT:    vor.vv v12, v16, v12
-; RV32V-NEXT:    vor.vv v8, v8, v20
-; RV32V-NEXT:    vor.vv v8, v8, v12
-; RV32V-NEXT:    vsrl.vi v12, v8, 4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v16
-; RV32V-NEXT:    vand.vv v12, v12, v16
-; RV32V-NEXT:    vsll.vi v8, v8, 4
-; RV32V-NEXT:    vor.vv v8, v12, v8
-; RV32V-NEXT:    vsrl.vi v12, v8, 2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v16
-; RV32V-NEXT:    vand.vv v12, v12, v16
-; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v12, v8
-; RV32V-NEXT:    vsrl.vi v12, v8, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v16
-; RV32V-NEXT:    vand.vv v12, v12, v16
-; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v8, v12, v8
-; RV32V-NEXT:    vsrl.vi v8, v8, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 8
-; RV32V-NEXT:    add sp, sp, a0
-; RV32V-NEXT:    lw ra, 364(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s0, 360(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s1, 356(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s2, 352(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s3, 348(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s4, 344(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s5, 340(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s6, 336(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s7, 332(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s8, 328(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s9, 324(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s10, 320(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s11, 316(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    addi sp, sp, 368
-; RV32V-NEXT:    ret
-;
-; RV64V-LABEL: clmulh_nxv4i64_vx:
-; RV64V:       # %bb.0:
-; RV64V-NEXT:    addi sp, sp, -16
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    sub sp, sp, a1
-; RV64V-NEXT:    li a1, 56
-; RV64V-NEXT:    lui t2, 16
-; RV64V-NEXT:    lui a2, 4080
-; RV64V-NEXT:    li t0, 255
-; RV64V-NEXT:    lui a3, 61681
-; RV64V-NEXT:    lui a4, 209715
-; RV64V-NEXT:    lui a5, 349525
-; RV64V-NEXT:    srli a6, a0, 24
-; RV64V-NEXT:    srli a7, a0, 8
-; RV64V-NEXT:    srli t1, a0, 40
-; RV64V-NEXT:    srli t3, a0, 56
-; RV64V-NEXT:    addi a3, a3, -241
-; RV64V-NEXT:    addi a4, a4, 819
-; RV64V-NEXT:    addi t4, a5, 1365
-; RV64V-NEXT:    slli a5, a3, 32
-; RV64V-NEXT:    add a5, a3, a5
-; RV64V-NEXT:    slli a3, a4, 32
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, t4, 32
-; RV64V-NEXT:    add a3, t4, a3
-; RV64V-NEXT:    srliw t4, a0, 24
-; RV64V-NEXT:    slli t0, t0, 24
-; RV64V-NEXT:    and a6, a6, a2
-; RV64V-NEXT:    and a7, a7, t0
-; RV64V-NEXT:    or t5, a7, a6
-; RV64V-NEXT:    addi a6, t2, -256
-; RV64V-NEXT:    and a7, t1, a6
-; RV64V-NEXT:    or t1, a7, t3
-; RV64V-NEXT:    and a7, a0, a2
-; RV64V-NEXT:    slli t4, t4, 32
-; RV64V-NEXT:    slli a7, a7, 24
-; RV64V-NEXT:    or t3, a7, t4
-; RV64V-NEXT:    li a7, 40
-; RV64V-NEXT:    vsetvli t4, zero, e64, m4, ta, ma
-; RV64V-NEXT:    vsrl.vi v16, v8, 24
-; RV64V-NEXT:    vsrl.vi v12, v8, 8
-; RV64V-NEXT:    or t1, t5, t1
-; RV64V-NEXT:    slli t4, a0, 56
-; RV64V-NEXT:    and a0, a0, a6
-; RV64V-NEXT:    slli a0, a0, 40
-; RV64V-NEXT:    or t4, t4, a0
-; RV64V-NEXT:    li a0, 1
-; RV64V-NEXT:    or t4, t4, t3
-; RV64V-NEXT:    lui t3, 1
-; RV64V-NEXT:    vsrl.vx v20, v8, a1
-; RV64V-NEXT:    vsrl.vx v24, v8, a7
-; RV64V-NEXT:    vand.vx v16, v16, a2
-; RV64V-NEXT:    vand.vx v28, v8, a2
-; RV64V-NEXT:    vsll.vx v4, v8, a1
-; RV64V-NEXT:    vand.vx v24, v24, a6
-; RV64V-NEXT:    vand.vx v12, v12, t0
-; RV64V-NEXT:    vsll.vi v28, v28, 24
-; RV64V-NEXT:    vor.vv v20, v24, v20
-; RV64V-NEXT:    vand.vx v24, v8, t0
-; RV64V-NEXT:    vand.vx v8, v8, a6
-; RV64V-NEXT:    vor.vv v12, v12, v16
-; RV64V-NEXT:    vsll.vi v16, v24, 8
-; RV64V-NEXT:    vsll.vx v8, v8, a7
-; RV64V-NEXT:    vor.vv v12, v12, v20
-; RV64V-NEXT:    vor.vv v16, v28, v16
-; RV64V-NEXT:    vor.vv v8, v4, v8
-; RV64V-NEXT:    vor.vv v8, v8, v16
-; RV64V-NEXT:    vor.vv v8, v8, v12
-; RV64V-NEXT:    or t1, t4, t1
-; RV64V-NEXT:    vsrl.vi v12, v8, 4
-; RV64V-NEXT:    vand.vx v8, v8, a5
-; RV64V-NEXT:    srli t4, t1, 4
-; RV64V-NEXT:    and t1, t1, a5
-; RV64V-NEXT:    vand.vx v12, v12, a5
-; RV64V-NEXT:    vsll.vi v8, v8, 4
-; RV64V-NEXT:    and t4, t4, a5
-; RV64V-NEXT:    slli t1, t1, 4
-; RV64V-NEXT:    vor.vv v8, v12, v8
-; RV64V-NEXT:    or t1, t4, t1
-; RV64V-NEXT:    vsrl.vi v12, v8, 2
-; RV64V-NEXT:    vand.vx v8, v8, a4
-; RV64V-NEXT:    srli t4, t1, 2
-; RV64V-NEXT:    and t1, t1, a4
-; RV64V-NEXT:    vand.vx v12, v12, a4
-; RV64V-NEXT:    vsll.vi v8, v8, 2
-; RV64V-NEXT:    and t4, t4, a4
-; RV64V-NEXT:    slli t1, t1, 2
-; RV64V-NEXT:    vor.vv v8, v12, v8
-; RV64V-NEXT:    or t1, t4, t1
-; RV64V-NEXT:    vsrl.vi v12, v8, 1
-; RV64V-NEXT:    vand.vx v8, v8, a3
-; RV64V-NEXT:    srli t4, t1, 1
-; RV64V-NEXT:    and t1, t1, a3
-; RV64V-NEXT:    vand.vx v12, v12, a3
-; RV64V-NEXT:    vadd.vv v8, v8, v8
-; RV64V-NEXT:    and t4, t4, a3
-; RV64V-NEXT:    slli t1, t1, 1
-; RV64V-NEXT:    vor.vv v8, v12, v8
-; RV64V-NEXT:    or t1, t4, t1
-; RV64V-NEXT:    andi t4, t1, 2
-; RV64V-NEXT:    vmul.vx v12, v8, t4
-; RV64V-NEXT:    andi t4, t1, 1
-; RV64V-NEXT:    vmul.vx v16, v8, t4
-; RV64V-NEXT:    andi t4, t1, 4
-; RV64V-NEXT:    vmul.vx v20, v8, t4
-; RV64V-NEXT:    andi t4, t1, 8
-; RV64V-NEXT:    vmul.vx v24, v8, t4
-; RV64V-NEXT:    andi t4, t1, 16
-; RV64V-NEXT:    vmul.vx v28, v8, t4
-; RV64V-NEXT:    andi t4, t1, 32
-; RV64V-NEXT:    vmul.vx v4, v8, t4
-; RV64V-NEXT:    andi t4, t1, 64
-; RV64V-NEXT:    vxor.vv v12, v16, v12
-; RV64V-NEXT:    vmul.vx v16, v8, t4
-; RV64V-NEXT:    andi t4, t1, 128
-; RV64V-NEXT:    vxor.vv v12, v12, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t4
-; RV64V-NEXT:    andi t4, t1, 256
-; RV64V-NEXT:    vxor.vv v12, v12, v24
-; RV64V-NEXT:    vmul.vx v24, v8, t4
-; RV64V-NEXT:    andi t4, t1, 512
-; RV64V-NEXT:    vxor.vv v12, v12, v28
-; RV64V-NEXT:    vmul.vx v28, v8, t4
-; RV64V-NEXT:    andi t4, t1, 1024
-; RV64V-NEXT:    vxor.vv v4, v12, v4
-; RV64V-NEXT:    vmul.vx v12, v8, t4
-; RV64V-NEXT:    slli t4, a0, 11
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v4, v4, v16
-; RV64V-NEXT:    vmul.vx v16, v8, t4
-; RV64V-NEXT:    lui t4, 2
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v4, v4, v20
-; RV64V-NEXT:    addi t5, sp, 16
-; RV64V-NEXT:    vs4r.v v4, (t5) # vscale x 32-byte Folded Spill
-; RV64V-NEXT:    vmul.vx v20, v8, t3
-; RV64V-NEXT:    lui t3, 4
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v24, v4, v24
-; RV64V-NEXT:    vxor.vv v28, v24, v28
-; RV64V-NEXT:    vmul.vx v24, v8, t4
-; RV64V-NEXT:    lui t4, 8
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v12, v28, v12
-; RV64V-NEXT:    vmul.vx v28, v8, t3
-; RV64V-NEXT:    lui t3, 32
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v12, v12, v16
-; RV64V-NEXT:    vmul.vx v4, v8, t4
-; RV64V-NEXT:    lui t4, 64
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v12, v12, v20
-; RV64V-NEXT:    vmul.vx v0, v8, t2
-; RV64V-NEXT:    lui t2, 128
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v16, v12, v24
-; RV64V-NEXT:    vmul.vx v12, v8, t3
-; RV64V-NEXT:    lui t3, 256
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v20, v16, v28
-; RV64V-NEXT:    vmul.vx v16, v8, t4
-; RV64V-NEXT:    lui t4, 512
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v24, v20, v4
-; RV64V-NEXT:    vmul.vx v20, v8, t2
-; RV64V-NEXT:    lui t2, 1024
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v28, v24, v0
-; RV64V-NEXT:    vmul.vx v24, v8, t3
-; RV64V-NEXT:    lui t3, 2048
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v12, v28, v12
-; RV64V-NEXT:    vmul.vx v28, v8, t4
-; RV64V-NEXT:    lui t4, 4096
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v16, v12, v16
-; RV64V-NEXT:    vmul.vx v12, v8, t2
-; RV64V-NEXT:    lui t2, 8192
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t3
-; RV64V-NEXT:    lui t3, 16384
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    vmul.vx v24, v8, t4
-; RV64V-NEXT:    lui t4, 32768
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v16, v16, v28
-; RV64V-NEXT:    vmul.vx v28, v8, t2
-; RV64V-NEXT:    lui t2, 65536
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v12, v16, v12
-; RV64V-NEXT:    vmul.vx v16, v8, t3
-; RV64V-NEXT:    lui t3, 131072
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v12, v12, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t4
-; RV64V-NEXT:    lui t4, 262144
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v24, v12, v24
-; RV64V-NEXT:    vxor.vv v4, v24, v28
-; RV64V-NEXT:    vmul.vx v24, v8, t2
-; RV64V-NEXT:    slli t2, a0, 32
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vl4r.v v28, (t5) # vscale x 32-byte Folded Reload
-; RV64V-NEXT:    vsll.vx v28, v28, a1
-; RV64V-NEXT:    vand.vx v12, v12, a6
-; RV64V-NEXT:    vsll.vx v12, v12, a7
-; RV64V-NEXT:    vor.vv v12, v28, v12
-; RV64V-NEXT:    vmul.vx v28, v8, t3
-; RV64V-NEXT:    slli t3, a0, 33
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v16, v4, v16
-; RV64V-NEXT:    vmul.vx v4, v8, t4
-; RV64V-NEXT:    slli t4, a0, 34
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v16, v16, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t2
-; RV64V-NEXT:    slli t2, a0, 35
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    vmul.vx v24, v8, t3
-; RV64V-NEXT:    slli t3, a0, 36
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v28, v16, v28
-; RV64V-NEXT:    vmul.vx v16, v8, t4
-; RV64V-NEXT:    srliw t4, t1, 31
-; RV64V-NEXT:    slli t4, t4, 31
-; RV64V-NEXT:    vxor.vv v28, v28, v4
-; RV64V-NEXT:    vmul.vx v4, v8, t4
-; RV64V-NEXT:    slli t4, a0, 37
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v4, v28, v4
-; RV64V-NEXT:    vmul.vx v28, v8, t2
-; RV64V-NEXT:    slli t2, a0, 38
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v4, v4, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t3
-; RV64V-NEXT:    slli t3, a0, 39
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v4, v4, v24
-; RV64V-NEXT:    vmul.vx v24, v8, t4
-; RV64V-NEXT:    slli t4, a0, 40
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v4, v4, v16
-; RV64V-NEXT:    vmul.vx v16, v8, t2
-; RV64V-NEXT:    slli t2, a0, 41
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v4, v4, v28
-; RV64V-NEXT:    vmul.vx v28, v8, t3
-; RV64V-NEXT:    slli t3, a0, 42
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v4, v4, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t4
-; RV64V-NEXT:    slli t4, a0, 43
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v4, v4, v24
-; RV64V-NEXT:    vmul.vx v24, v8, t2
-; RV64V-NEXT:    slli t2, a0, 44
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v16, v4, v16
-; RV64V-NEXT:    vmul.vx v4, v8, t3
-; RV64V-NEXT:    slli t3, a0, 45
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v16, v16, v28
-; RV64V-NEXT:    vmul.vx v28, v8, t4
-; RV64V-NEXT:    slli t4, a0, 46
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v20, v16, v20
-; RV64V-NEXT:    vxor.vv v24, v20, v24
-; RV64V-NEXT:    vmul.vx v20, v8, t2
-; RV64V-NEXT:    slli t2, a0, 47
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v4, v24, v4
-; RV64V-NEXT:    vmul.vx v24, v8, t3
-; RV64V-NEXT:    slli t3, a0, 48
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v4, v4, v28
-; RV64V-NEXT:    vmul.vx v28, v8, t4
-; RV64V-NEXT:    slli t4, a0, 49
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v4, v4, v20
-; RV64V-NEXT:    vmul.vx v20, v8, t2
-; RV64V-NEXT:    slli t2, a0, 50
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v4, v4, v24
-; RV64V-NEXT:    vmul.vx v24, v8, t3
-; RV64V-NEXT:    slli t3, a0, 51
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v28, v4, v28
-; RV64V-NEXT:    vmul.vx v4, v8, t4
-; RV64V-NEXT:    slli t4, a0, 52
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v20, v28, v20
-; RV64V-NEXT:    vmul.vx v28, v8, t2
-; RV64V-NEXT:    slli t2, a0, 53
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v24, v20, v24
-; RV64V-NEXT:    vxor.vv v4, v24, v4
-; RV64V-NEXT:    vmul.vx v24, v8, t3
-; RV64V-NEXT:    slli t3, a0, 54
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v28, v4, v28
-; RV64V-NEXT:    vsrl.vi v4, v16, 8
-; RV64V-NEXT:    vand.vx v4, v4, t0
-; RV64V-NEXT:    vsrl.vi v20, v20, 24
-; RV64V-NEXT:    vand.vx v20, v20, a2
-; RV64V-NEXT:    vor.vv v20, v4, v20
-; RV64V-NEXT:    vmul.vx v4, v8, t4
-; RV64V-NEXT:    slli t4, a0, 55
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v24, v28, v24
-; RV64V-NEXT:    vmul.vx v28, v8, t2
-; RV64V-NEXT:    slli t2, a0, 56
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v24, v24, v4
-; RV64V-NEXT:    vmul.vx v4, v8, t3
-; RV64V-NEXT:    slli t3, a0, 57
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v24, v24, v28
-; RV64V-NEXT:    vmul.vx v0, v8, t4
-; RV64V-NEXT:    slli t4, a0, 58
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v24, v24, v4
-; RV64V-NEXT:    vmul.vx v28, v8, t2
-; RV64V-NEXT:    slli t2, a0, 59
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vmul.vx v4, v8, t3
-; RV64V-NEXT:    slli t3, a0, 60
-; RV64V-NEXT:    vand.vx v16, v16, a2
-; RV64V-NEXT:    slli a2, a0, 61
-; RV64V-NEXT:    slli a0, a0, 62
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    and a2, t1, a2
-; RV64V-NEXT:    and a0, t1, a0
-; RV64V-NEXT:    srli t1, t1, 63
-; RV64V-NEXT:    vsll.vi v16, v16, 24
-; RV64V-NEXT:    vxor.vv v28, v24, v28
-; RV64V-NEXT:    vxor.vv v28, v28, v4
-; RV64V-NEXT:    vand.vx v4, v24, t0
-; RV64V-NEXT:    vsll.vi v4, v4, 8
-; RV64V-NEXT:    vor.vv v16, v16, v4
-; RV64V-NEXT:    vmul.vx v4, v8, t4
-; RV64V-NEXT:    vxor.vv v28, v28, v4
-; RV64V-NEXT:    vmul.vx v4, v8, t2
-; RV64V-NEXT:    vor.vv v12, v12, v16
-; RV64V-NEXT:    vmul.vx v16, v8, t3
-; RV64V-NEXT:    vxor.vv v28, v28, v4
-; RV64V-NEXT:    vmul.vx v4, v8, a2
-; RV64V-NEXT:    vxor.vv v16, v28, v16
-; RV64V-NEXT:    vmul.vx v28, v8, a0
-; RV64V-NEXT:    slli t1, t1, 63
-; RV64V-NEXT:    vmul.vx v8, v8, t1
-; RV64V-NEXT:    vsrl.vx v24, v24, a7
-; RV64V-NEXT:    vand.vx v24, v24, a6
-; RV64V-NEXT:    vxor.vv v16, v16, v4
-; RV64V-NEXT:    vxor.vv v16, v16, v28
-; RV64V-NEXT:    vxor.vv v8, v16, v8
-; RV64V-NEXT:    vsrl.vx v8, v8, a1
-; RV64V-NEXT:    vor.vv v8, v24, v8
-; RV64V-NEXT:    vor.vv v8, v20, v8
-; RV64V-NEXT:    vor.vv v8, v12, v8
-; RV64V-NEXT:    vsrl.vi v12, v8, 4
-; RV64V-NEXT:    vand.vx v8, v8, a5
-; RV64V-NEXT:    vand.vx v12, v12, a5
-; RV64V-NEXT:    vsll.vi v8, v8, 4
-; RV64V-NEXT:    vor.vv v8, v12, v8
-; RV64V-NEXT:    vsrl.vi v12, v8, 2
-; RV64V-NEXT:    vand.vx v8, v8, a4
-; RV64V-NEXT:    vand.vx v12, v12, a4
-; RV64V-NEXT:    vsll.vi v8, v8, 2
-; RV64V-NEXT:    vor.vv v8, v12, v8
-; RV64V-NEXT:    vsrl.vi v12, v8, 1
-; RV64V-NEXT:    vand.vx v8, v8, a3
-; RV64V-NEXT:    vand.vx v12, v12, a3
-; RV64V-NEXT:    vadd.vv v8, v8, v8
-; RV64V-NEXT:    vor.vv v8, v12, v8
-; RV64V-NEXT:    vsrl.vi v8, v8, 1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add sp, sp, a0
-; RV64V-NEXT:    addi sp, sp, 16
-; RV64V-NEXT:    ret
-;
-; RV32ZVBC-LABEL: clmulh_nxv4i64_vx:
-; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    addi sp, sp, -16
-; RV32ZVBC-NEXT:    sw a0, 8(sp)
-; RV32ZVBC-NEXT:    sw a1, 12(sp)
-; RV32ZVBC-NEXT:    addi a0, sp, 8
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32ZVBC-NEXT:    vlse64.v v12, (a0), zero
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v12
-; RV32ZVBC-NEXT:    addi sp, sp, 16
-; RV32ZVBC-NEXT:    ret
-;
-; RV64ZVBC-LABEL: clmulh_nxv4i64_vx:
-; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV64ZVBC-NEXT:    vclmulh.vx v8, v8, a0
-; RV64ZVBC-NEXT:    ret
-  %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i128 0
-  %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-  %va.ext = zext <vscale x 4 x i64> %va to <vscale x 4 x i128>
-  %vb.ext = zext <vscale x 4 x i64> %vb to <vscale x 4 x i128>
-  %clmul = call <vscale x 4 x i128> @llvm.clmul.nxv4i128(<vscale x 4 x i128> %va.ext, <vscale x 4 x i128> %vb.ext)
-  %res.ext = lshr <vscale x 4 x i128> %clmul, splat(i128 64)
-  %res = trunc <vscale x 4 x i128> %res.ext to <vscale x 4 x i64>
-  ret <vscale x 4 x i64> %res
-}
-
-define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb) nounwind {
-; RV32V-LABEL: clmulh_nxv8i64_vv:
-; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -352
-; RV32V-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    sub sp, sp, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui t6, 16
-; RV32V-NEXT:    li t5, 56
-; RV32V-NEXT:    li t4, 40
-; RV32V-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
-; RV32V-NEXT:    vsrl.vx v24, v8, t5
-; RV32V-NEXT:    vsrl.vx v0, v8, t4
-; RV32V-NEXT:    addi t3, t6, -256
-; RV32V-NEXT:    vand.vx v0, v0, t3
-; RV32V-NEXT:    vor.vv v24, v0, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vsrl.vx v24, v16, t4
-; RV32V-NEXT:    vand.vx v24, v24, t3
-; RV32V-NEXT:    vsrl.vx v0, v16, t5
-; RV32V-NEXT:    vor.vv v24, v24, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, t3
-; RV32V-NEXT:    vsll.vx v24, v24, t4
-; RV32V-NEXT:    vsll.vx v0, v8, t5
-; RV32V-NEXT:    vor.vv v8, v0, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v8, v16, t3
-; RV32V-NEXT:    vsll.vx v8, v8, t4
-; RV32V-NEXT:    vsll.vx v0, v16, t5
-; RV32V-NEXT:    vor.vv v8, v0, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui a5, 1044480
-; RV32V-NEXT:    lui a4, 524288
-; RV32V-NEXT:    li ra, 1
-; RV32V-NEXT:    li a6, 2
-; RV32V-NEXT:    li a7, 4
-; RV32V-NEXT:    li s0, 8
-; RV32V-NEXT:    li s11, 16
-; RV32V-NEXT:    li s10, 32
-; RV32V-NEXT:    li s9, 64
-; RV32V-NEXT:    li s8, 128
-; RV32V-NEXT:    li s7, 256
-; RV32V-NEXT:    li s6, 512
-; RV32V-NEXT:    li s5, 1024
-; RV32V-NEXT:    lui s4, 1
-; RV32V-NEXT:    lui s3, 2
-; RV32V-NEXT:    lui s2, 4
-; RV32V-NEXT:    lui s1, 8
-; RV32V-NEXT:    lui a0, 32
-; RV32V-NEXT:    lui a1, 64
-; RV32V-NEXT:    lui a2, 128
-; RV32V-NEXT:    lui a3, 256
-; RV32V-NEXT:    lui t1, 512
-; RV32V-NEXT:    lui t0, 1024
-; RV32V-NEXT:    lui t2, 2048
-; RV32V-NEXT:    sw a5, 272(sp)
-; RV32V-NEXT:    lui a5, 4096
-; RV32V-NEXT:    sw zero, 276(sp)
-; RV32V-NEXT:    sw a4, 264(sp)
-; RV32V-NEXT:    sw zero, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw ra, 260(sp)
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw a6, 252(sp)
-; RV32V-NEXT:    lui a6, 8192
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw a7, 244(sp)
-; RV32V-NEXT:    lui a7, 16384
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw s0, 236(sp)
-; RV32V-NEXT:    lui s0, 32768
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw s11, 228(sp)
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s10, 220(sp)
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s9, 212(sp)
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s8, 204(sp)
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s7, 196(sp)
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s6, 188(sp)
-; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw s5, 180(sp)
-; RV32V-NEXT:    slli ra, ra, 11
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw ra, 172(sp)
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s4, 164(sp)
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw s3, 156(sp)
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw s2, 148(sp)
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw s1, 140(sp)
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw t6, 132(sp)
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw a0, 124(sp)
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw a1, 116(sp)
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw a2, 108(sp)
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw a3, 100(sp)
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw t1, 92(sp)
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw t0, 84(sp)
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw t2, 76(sp)
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a5, 68(sp)
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw a6, 60(sp)
-; RV32V-NEXT:    lui a2, 8192
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a7, 52(sp)
-; RV32V-NEXT:    lui t2, 16384
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw s0, 44(sp)
-; RV32V-NEXT:    lui a7, 65536
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw a7, 36(sp)
-; RV32V-NEXT:    lui a6, 131072
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    sw a6, 28(sp)
-; RV32V-NEXT:    lui a5, 262144
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw a5, 20(sp)
-; RV32V-NEXT:    sw a4, 12(sp)
-; RV32V-NEXT:    sw zero, 8(sp)
-; RV32V-NEXT:    lui a3, 4080
-; RV32V-NEXT:    addi t0, sp, 272
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vsrl.vi v8, v8, 24
-; RV32V-NEXT:    vand.vx v8, v8, a3
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vsrl.vi v0, v0, 8
-; RV32V-NEXT:    vand.vv v0, v0, v24
-; RV32V-NEXT:    vor.vv v8, v0, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 9
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vsrl.vi v0, v16, 24
-; RV32V-NEXT:    vand.vx v0, v0, a3
-; RV32V-NEXT:    vsrl.vi v24, v16, 8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v24, v16
-; RV32V-NEXT:    vor.vv v24, v24, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vor.vv v8, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vx v0, v16, a3
-; RV32V-NEXT:    vsll.vi v0, v0, 24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v16, v16, v8
-; RV32V-NEXT:    vsll.vi v16, v16, 8
-; RV32V-NEXT:    vor.vv v0, v0, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vor.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 9
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vx v24, v8, a3
-; RV32V-NEXT:    vsll.vi v24, v24, 24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 9
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v16, v8
-; RV32V-NEXT:    vsll.vi v8, v8, 8
-; RV32V-NEXT:    vor.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vor.vv v24, v24, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vor.vv v8, v0, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vor.vv v24, v24, v0
-; RV32V-NEXT:    lui t0, 61681
-; RV32V-NEXT:    addi t0, t0, -241
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vor.vv v16, v8, v16
-; RV32V-NEXT:    vsetvli t1, zero, e32, m8, ta, ma
-; RV32V-NEXT:    vmv.v.x v0, t0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 9
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vsetvli t0, zero, e64, m8, ta, ma
-; RV32V-NEXT:    vsrl.vi v8, v24, 4
-; RV32V-NEXT:    vand.vv v24, v24, v0
-; RV32V-NEXT:    vand.vv v8, v8, v0
-; RV32V-NEXT:    vsll.vi v24, v24, 4
-; RV32V-NEXT:    vor.vv v8, v8, v24
-; RV32V-NEXT:    vsrl.vi v24, v16, 4
-; RV32V-NEXT:    vand.vv v16, v16, v0
-; RV32V-NEXT:    vand.vv v24, v24, v0
-; RV32V-NEXT:    vsll.vi v16, v16, 4
-; RV32V-NEXT:    vor.vv v16, v24, v16
-; RV32V-NEXT:    lui t0, 209715
-; RV32V-NEXT:    addi t0, t0, 819
-; RV32V-NEXT:    vsetvli t1, zero, e32, m8, ta, ma
-; RV32V-NEXT:    vmv.v.x v0, t0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vsetvli t0, zero, e64, m8, ta, ma
-; RV32V-NEXT:    vsrl.vi v24, v8, 2
-; RV32V-NEXT:    vand.vv v8, v8, v0
-; RV32V-NEXT:    vand.vv v24, v24, v0
-; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v24, v8
-; RV32V-NEXT:    vsrl.vi v24, v16, 2
-; RV32V-NEXT:    vand.vv v16, v16, v0
-; RV32V-NEXT:    vand.vv v24, v24, v0
-; RV32V-NEXT:    vsll.vi v16, v16, 2
-; RV32V-NEXT:    vor.vv v24, v24, v16
-; RV32V-NEXT:    lui t0, 349525
-; RV32V-NEXT:    addi t0, t0, 1365
-; RV32V-NEXT:    vsetvli t1, zero, e32, m8, ta, ma
-; RV32V-NEXT:    vmv.v.x v0, t0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vsetvli t0, zero, e64, m8, ta, ma
-; RV32V-NEXT:    vsrl.vi v16, v8, 1
-; RV32V-NEXT:    vand.vv v8, v8, v0
-; RV32V-NEXT:    vand.vv v16, v16, v0
-; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v16, v16, v8
-; RV32V-NEXT:    vsrl.vi v8, v24, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v24, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi t0, sp, 264
-; RV32V-NEXT:    vlse64.v v0, (t0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vadd.vv v24, v24, v24
-; RV32V-NEXT:    vor.vv v8, v8, v24
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi t0, sp, 256
-; RV32V-NEXT:    addi t1, sp, 248
-; RV32V-NEXT:    addi a1, sp, 240
-; RV32V-NEXT:    addi a0, sp, 232
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
-; RV32V-NEXT:    csrr a4, vlenb
-; RV32V-NEXT:    slli a4, a4, 3
-; RV32V-NEXT:    mv t0, a4
-; RV32V-NEXT:    slli a4, a4, 2
-; RV32V-NEXT:    add t0, t0, a4
-; RV32V-NEXT:    slli a4, a4, 1
-; RV32V-NEXT:    add t0, t0, a4
-; RV32V-NEXT:    slli a4, a4, 1
-; RV32V-NEXT:    add t0, t0, a4
-; RV32V-NEXT:    slli a4, a4, 1
-; RV32V-NEXT:    add a4, a4, t0
-; RV32V-NEXT:    add a4, sp, a4
-; RV32V-NEXT:    addi a4, a4, 288
-; RV32V-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (t1), zero
-; RV32V-NEXT:    vlse64.v v24, (a1), zero
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    mv a4, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a4, a4, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a4, a4, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a4, a4, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a4
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 288
-; RV32V-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 224
-; RV32V-NEXT:    addi a1, sp, 216
-; RV32V-NEXT:    addi t0, sp, 208
-; RV32V-NEXT:    addi t1, sp, 200
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a4, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a4, a4, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a4, a4, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (a1), zero
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (t1), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 192
-; RV32V-NEXT:    addi a1, sp, 184
-; RV32V-NEXT:    addi t0, sp, 176
-; RV32V-NEXT:    addi t1, sp, 168
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a4, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a4, a4, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a4, a4, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (a1), zero
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (t1), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 160
-; RV32V-NEXT:    addi a1, sp, 152
-; RV32V-NEXT:    addi t0, sp, 144
-; RV32V-NEXT:    addi t1, sp, 136
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a4, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a4, a4, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (a1), zero
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (t1), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 128
-; RV32V-NEXT:    addi a1, sp, 120
-; RV32V-NEXT:    addi t0, sp, 112
-; RV32V-NEXT:    addi t1, sp, 104
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a4, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a4, a4, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a4, a4, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (a1), zero
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (t1), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 96
-; RV32V-NEXT:    addi a1, sp, 88
-; RV32V-NEXT:    addi t0, sp, 80
-; RV32V-NEXT:    addi t1, sp, 72
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a4, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a4, a4, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (a1), zero
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (t1), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 64
-; RV32V-NEXT:    addi a1, sp, 56
-; RV32V-NEXT:    addi t0, sp, 48
-; RV32V-NEXT:    addi t1, sp, 40
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a4, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a4, a4, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (a1), zero
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (t1), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 32
-; RV32V-NEXT:    addi a1, sp, 24
-; RV32V-NEXT:    addi t0, sp, 16
-; RV32V-NEXT:    addi t1, sp, 8
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a4, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (a1), zero
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (t1), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 8
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s11
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s9
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s7
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s6
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s5
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, ra
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s3
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, t6
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui a0, 32
-; RV32V-NEXT:    vand.vx v24, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui a0, 64
-; RV32V-NEXT:    vand.vx v24, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui a0, 128
-; RV32V-NEXT:    vand.vx v24, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui a0, 256
-; RV32V-NEXT:    vand.vx v24, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui a0, 512
-; RV32V-NEXT:    vand.vx v24, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui a0, 1024
-; RV32V-NEXT:    vand.vx v24, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui a0, 2048
-; RV32V-NEXT:    vand.vx v24, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui a0, 4096
-; RV32V-NEXT:    vand.vx v24, v8, a0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, a2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, t2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, a7
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, a6
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, a5
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vi v24, v8, 2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vi v0, v8, 1
-; RV32V-NEXT:    vand.vi v24, v8, 4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vi v8, v8, 8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v24, v16, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v0, v16, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v0, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 8
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 8
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v16, v8
-; RV32V-NEXT:    vxor.vi v8, v24, 0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 8
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vsrl.vx v16, v8, t4
-; RV32V-NEXT:    vand.vx v16, v16, t3
-; RV32V-NEXT:    vsrl.vx v24, v8, t5
-; RV32V-NEXT:    vor.vv v16, v16, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vsrl.vi v24, v8, 24
-; RV32V-NEXT:    vand.vx v24, v24, a3
-; RV32V-NEXT:    vsrl.vi v0, v8, 8
+define <vscale x 16 x i32> @clmulh_nxv16i32_vv(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb) nounwind {
+; RV32-LABEL: clmulh_nxv16i32_vv:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -80
+; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 68(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 64(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s3, 60(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s4, 56(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s5, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s6, 48(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s7, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s8, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s9, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s10, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s11, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    lui a0, 16
+; RV32-NEXT:    vsrl.vi v0, v8, 24
+; RV32-NEXT:    addi a0, a0, -256
+; RV32-NEXT:    vand.vx v24, v24, a0
+; RV32-NEXT:    vor.vv v24, v24, v0
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 1
+; RV32-NEXT:    add a2, a2, a1
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v16, 8
+; RV32-NEXT:    vand.vx v0, v0, a0
+; RV32-NEXT:    vsrl.vi v24, v16, 24
+; RV32-NEXT:    vor.vv v0, v0, v24
+; RV32-NEXT:    vsll.vi v24, v8, 24
+; RV32-NEXT:    vand.vx v8, v8, a0
+; RV32-NEXT:    vsll.vi v8, v8, 8
+; RV32-NEXT:    vor.vv v8, v24, v8
+; RV32-NEXT:    vsll.vi v24, v16, 24
+; RV32-NEXT:    vand.vx v16, v16, a0
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui t6, 349525
+; RV32-NEXT:    li s5, 16
+; RV32-NEXT:    li t2, 32
+; RV32-NEXT:    li a7, 256
+; RV32-NEXT:    li t0, 512
+; RV32-NEXT:    li t1, 1024
+; RV32-NEXT:    li s6, 1
+; RV32-NEXT:    lui t3, 1
+; RV32-NEXT:    lui t4, 2
+; RV32-NEXT:    lui t5, 4
+; RV32-NEXT:    lui s0, 8
+; RV32-NEXT:    lui s1, 32
+; RV32-NEXT:    lui s2, 64
+; RV32-NEXT:    lui s3, 128
+; RV32-NEXT:    lui s4, 256
+; RV32-NEXT:    lui s7, 512
+; RV32-NEXT:    lui s8, 1024
+; RV32-NEXT:    lui s9, 2048
+; RV32-NEXT:    lui s10, 4096
+; RV32-NEXT:    lui s11, 8192
+; RV32-NEXT:    lui ra, 16384
+; RV32-NEXT:    lui a5, 32768
+; RV32-NEXT:    addi a3, a1, -241
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a1, t6, 1365
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsrl.vi v24, v16, 4
+; RV32-NEXT:    vand.vx v16, v16, a3
+; RV32-NEXT:    vand.vx v24, v24, a3
+; RV32-NEXT:    vsll.vi v16, v16, 4
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vsrl.vi v24, v16, 2
+; RV32-NEXT:    vand.vx v16, v16, a2
+; RV32-NEXT:    vand.vx v24, v24, a2
+; RV32-NEXT:    vsll.vi v16, v16, 2
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vsrl.vi v24, v16, 1
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vand.vx v24, v24, a1
+; RV32-NEXT:    vadd.vv v16, v16, v16
+; RV32-NEXT:    vor.vv v0, v24, v16
+; RV32-NEXT:    vand.vx v16, v0, s5
+; RV32-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    mv a4, a0
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    lui t6, 65536
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    mv a4, a0
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    add a4, a4, a0
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vand.vx v8, v8, a3
+; RV32-NEXT:    vand.vx v16, v16, a3
+; RV32-NEXT:    vsll.vi v8, v8, 4
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 2
+; RV32-NEXT:    vand.vx v8, v8, a2
+; RV32-NEXT:    vand.vx v16, v16, a2
+; RV32-NEXT:    vsll.vi v8, v8, 2
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vx v8, v8, a1
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vadd.vv v8, v8, v8
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vand.vx v16, v0, t2
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    mv a4, a0
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    add a4, a4, a0
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    lui a4, 131072
+; RV32-NEXT:    slli s6, s6, 11
+; RV32-NEXT:    lui t2, 262144
+; RV32-NEXT:    lui s5, 524288
+; RV32-NEXT:    li a6, 64
+; RV32-NEXT:    vand.vx v16, v0, a6
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 3
+; RV32-NEXT:    mv a0, a6
+; RV32-NEXT:    slli a6, a6, 5
+; RV32-NEXT:    add a6, a6, a0
+; RV32-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    li a6, 128
+; RV32-NEXT:    vand.vx v16, v0, a6
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 8
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, a7
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 3
+; RV32-NEXT:    mv a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, t0
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 4
+; RV32-NEXT:    mv a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, t1
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 3
+; RV32-NEXT:    mv a7, a6
+; RV32-NEXT:    slli a6, a6, 2
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, s6
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 5
+; RV32-NEXT:    mv a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, t3
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 3
+; RV32-NEXT:    mv a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 2
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, t4
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 4
+; RV32-NEXT:    mv a7, a6
+; RV32-NEXT:    slli a6, a6, 2
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, t5
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 3
+; RV32-NEXT:    mv a7, a6
+; RV32-NEXT:    slli a6, a6, 3
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, s0
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 6
+; RV32-NEXT:    mv a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    lui a6, 16
+; RV32-NEXT:    vand.vx v16, v0, a6
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 3
+; RV32-NEXT:    mv a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 2
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, s1
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 4
+; RV32-NEXT:    mv a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 2
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, s2
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 3
+; RV32-NEXT:    mv a7, a6
+; RV32-NEXT:    slli a6, a6, 2
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 2
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, s3
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 5
+; RV32-NEXT:    mv a7, a6
+; RV32-NEXT:    slli a6, a6, 2
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, s4
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 3
+; RV32-NEXT:    mv a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 3
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, s7
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 4
+; RV32-NEXT:    mv a7, a6
+; RV32-NEXT:    slli a6, a6, 3
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, s8
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 3
+; RV32-NEXT:    mv a7, a6
+; RV32-NEXT:    slli a6, a6, 4
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, s9
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, s10
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 3
+; RV32-NEXT:    mv a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, s11
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 4
+; RV32-NEXT:    mv a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, ra
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 3
+; RV32-NEXT:    mv a7, a6
+; RV32-NEXT:    slli a6, a6, 2
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    slli a6, a6, 1
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, a5
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    slli a5, a5, 5
+; RV32-NEXT:    mv a6, a5
+; RV32-NEXT:    slli a5, a5, 1
+; RV32-NEXT:    add a5, a5, a6
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 16
+; RV32-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, t6
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    slli a5, a5, 3
+; RV32-NEXT:    mv a6, a5
+; RV32-NEXT:    slli a5, a5, 1
+; RV32-NEXT:    add a6, a6, a5
+; RV32-NEXT:    slli a5, a5, 2
+; RV32-NEXT:    add a5, a5, a6
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 16
+; RV32-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, a4
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vi v16, v0, 2
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vi v24, v0, 1
+; RV32-NEXT:    vand.vi v16, v0, 4
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vi v16, v0, 8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, t2
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 6
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vand.vx v16, v0, s5
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vmul.vv v16, v8, v24
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v24, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v0, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 5
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 8
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 5
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 5
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 6
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 6
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 5
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 5
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 7
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 7
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 5
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 5
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 8
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 6
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v16, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v16, v16, v8
+; RV32-NEXT:    vxor.vv v16, v16, v24
+; RV32-NEXT:    vxor.vv v16, v16, v0
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v16, v16, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v16, v16, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v16, v16, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v16, v16, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v24, v16, v8
+; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v24, v24, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v24, v24, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 5
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v24, v24, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v24, v24, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v24, v24, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v24, v24, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 6
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v24, v24, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v24, v24, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v24, v24, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v24, v24, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 5
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v24, v24, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v24, v24, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v24, v24, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v24, v24, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 7
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v24, v24, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v0, v24, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v0, v0, v8
+; RV32-NEXT:    vsll.vi v16, v16, 24
+; RV32-NEXT:    vand.vx v8, v24, a0
+; RV32-NEXT:    vsll.vi v8, v8, 8
+; RV32-NEXT:    vor.vv v16, v16, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v8, v0, v8
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v8, v8, v0
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 5
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v8, v8, v0
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 8
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v8, v8, v0
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v8, v8, v0
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    mv a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a5, a5, a4
+; RV32-NEXT:    slli a4, a4, 1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v24, v24, 8
+; RV32-NEXT:    vand.vx v24, v24, a0
+; RV32-NEXT:    vsrl.vi v8, v8, 24
+; RV32-NEXT:    vor.vv v8, v24, v8
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vand.vx v8, v8, a3
+; RV32-NEXT:    vand.vx v16, v16, a3
+; RV32-NEXT:    vsll.vi v8, v8, 4
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 2
+; RV32-NEXT:    vand.vx v8, v8, a2
+; RV32-NEXT:    vand.vx v16, v16, a2
+; RV32-NEXT:    vsll.vi v8, v8, 2
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vx v8, v8, a1
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vadd.vv v8, v8, v8
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v8, v8, 1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 64(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 60(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 56(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s6, 48(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s7, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s8, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s9, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s10, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s11, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 80
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: clmulh_nxv16i32_vv:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -144
+; RV64-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 128(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 120(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s3, 104(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s4, 96(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s5, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s6, 80(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s7, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s8, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s9, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s10, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s11, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    sub sp, sp, a0
+; RV64-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 8
+; RV64-NEXT:    lui a0, 16
+; RV64-NEXT:    vsrl.vi v0, v8, 24
+; RV64-NEXT:    addi a0, a0, -256
+; RV64-NEXT:    vand.vx v24, v24, a0
+; RV64-NEXT:    vor.vv v24, v24, v0
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    add a2, a2, a1
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vsrl.vi v0, v16, 8
+; RV64-NEXT:    vand.vx v0, v0, a0
+; RV64-NEXT:    vsrl.vi v24, v16, 24
+; RV64-NEXT:    vor.vv v0, v0, v24
+; RV64-NEXT:    vsll.vi v24, v8, 24
+; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vsll.vi v8, v8, 8
+; RV64-NEXT:    vor.vv v8, v24, v8
+; RV64-NEXT:    vsll.vi v24, v16, 24
+; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui t6, 349525
+; RV64-NEXT:    li s5, 16
+; RV64-NEXT:    li t2, 32
+; RV64-NEXT:    li a7, 256
+; RV64-NEXT:    li t0, 512
+; RV64-NEXT:    li t1, 1024
+; RV64-NEXT:    li s6, 1
+; RV64-NEXT:    lui t3, 1
+; RV64-NEXT:    lui t4, 2
+; RV64-NEXT:    lui t5, 4
+; RV64-NEXT:    lui s0, 8
+; RV64-NEXT:    lui s1, 32
+; RV64-NEXT:    lui s2, 64
+; RV64-NEXT:    lui s3, 128
+; RV64-NEXT:    lui s4, 256
+; RV64-NEXT:    lui s7, 512
+; RV64-NEXT:    lui s8, 1024
+; RV64-NEXT:    lui s9, 2048
+; RV64-NEXT:    lui s10, 4096
+; RV64-NEXT:    lui s11, 8192
+; RV64-NEXT:    lui ra, 16384
+; RV64-NEXT:    lui a5, 32768
+; RV64-NEXT:    addi a3, a1, -241
+; RV64-NEXT:    addi a2, a2, 819
+; RV64-NEXT:    addi a1, t6, 1365
+; RV64-NEXT:    vor.vv v16, v16, v0
+; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vand.vx v24, v24, a3
+; RV64-NEXT:    vsll.vi v16, v16, 4
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vsrl.vi v24, v16, 2
+; RV64-NEXT:    vand.vx v16, v16, a2
+; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    vsll.vi v16, v16, 2
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vsrl.vi v24, v16, 1
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vadd.vv v16, v16, v16
+; RV64-NEXT:    vor.vv v0, v24, v16
+; RV64-NEXT:    vand.vx v16, v0, s5
+; RV64-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    mv a4, a0
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    lui t6, 65536
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    mv a4, a0
+; RV64-NEXT:    slli a0, a0, 1
+; RV64-NEXT:    add a4, a4, a0
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vsll.vi v8, v8, 4
+; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v16, v16, a2
+; RV64-NEXT:    vsll.vi v8, v8, 2
+; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vadd.vv v8, v8, v8
+; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vand.vx v16, v0, t2
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    mv a4, a0
+; RV64-NEXT:    slli a0, a0, 1
+; RV64-NEXT:    add a4, a4, a0
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    lui a4, 131072
+; RV64-NEXT:    slli s6, s6, 11
+; RV64-NEXT:    lui t2, 262144
+; RV64-NEXT:    lui s5, 524288
+; RV64-NEXT:    li a6, 64
+; RV64-NEXT:    vand.vx v16, v0, a6
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a0, a6
+; RV64-NEXT:    slli a6, a6, 5
+; RV64-NEXT:    add a6, a6, a0
+; RV64-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    li a6, 128
+; RV64-NEXT:    vand.vx v16, v0, a6
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 8
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, a7
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, t0
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 4
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, t1
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 2
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s6
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 5
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, t3
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 2
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, t4
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 4
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 2
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, t5
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s0
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 6
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    lui a6, 16
+; RV64-NEXT:    vand.vx v16, v0, a6
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 2
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s1
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 4
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 2
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s2
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 2
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 2
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s3
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 5
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 2
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s4
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s7
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 4
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s8
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 4
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s9
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s10
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s11
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 4
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, ra
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 2
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, a5
+; RV64-NEXT:    csrr a5, vlenb
+; RV64-NEXT:    slli a5, a5, 5
+; RV64-NEXT:    mv a6, a5
+; RV64-NEXT:    slli a5, a5, 1
+; RV64-NEXT:    add a5, a5, a6
+; RV64-NEXT:    add a5, sp, a5
+; RV64-NEXT:    addi a5, a5, 32
+; RV64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, t6
+; RV64-NEXT:    csrr a5, vlenb
+; RV64-NEXT:    slli a5, a5, 3
+; RV64-NEXT:    mv a6, a5
+; RV64-NEXT:    slli a5, a5, 1
+; RV64-NEXT:    add a6, a6, a5
+; RV64-NEXT:    slli a5, a5, 2
+; RV64-NEXT:    add a5, a5, a6
+; RV64-NEXT:    add a5, sp, a5
+; RV64-NEXT:    addi a5, a5, 32
+; RV64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, a4
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vi v16, v0, 2
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vi v24, v0, 1
+; RV64-NEXT:    vand.vi v16, v0, 4
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vi v16, v0, 8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, t2
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 6
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s5
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vmul.vv v16, v8, v24
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v24, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v0, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 8
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    addi a4, sp, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 6
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 6
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 7
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 7
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 8
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 6
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v8, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v16, v16, v8
+; RV64-NEXT:    vxor.vv v16, v16, v24
+; RV64-NEXT:    vxor.vv v16, v16, v0
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v16, v16, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v16, v16, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v16, v16, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v16, v16, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v16, v8
+; RV64-NEXT:    addi a4, sp, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 6
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 7
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v0, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v0, v0, v8
+; RV64-NEXT:    vsll.vi v16, v16, 24
+; RV64-NEXT:    vand.vx v8, v24, a0
+; RV64-NEXT:    vsll.vi v8, v8, 8
+; RV64-NEXT:    vor.vv v16, v16, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v8, v0, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v8, v8, v0
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v8, v8, v0
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 8
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v8, v8, v0
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v8, v8, v0
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v8, v8, v0
+; RV64-NEXT:    vsrl.vi v24, v24, 8
+; RV64-NEXT:    vand.vx v24, v24, a0
+; RV64-NEXT:    vsrl.vi v8, v8, 24
+; RV64-NEXT:    vor.vv v8, v24, v8
+; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vsll.vi v8, v8, 4
+; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v16, v16, a2
+; RV64-NEXT:    vsll.vi v8, v8, 2
+; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vadd.vv v8, v8, v8
+; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v8, v8, 1
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    ld ra, 136(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 128(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 120(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s3, 104(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s4, 96(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s5, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s6, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s7, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s8, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s9, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s10, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s11, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 144
+; RV64-NEXT:    ret
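+  ; clmulh on i32 elements is expressed as a widening clmul: zero-extend both
+  ; operands to i64, carry-less multiply, then take the upper 32 bits of each
+  ; element via lshr+trunc.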
+  %va.ext = zext <vscale x 16 x i32> %va to <vscale x 16 x i64>
+  %vb.ext = zext <vscale x 16 x i32> %vb to <vscale x 16 x i64>
+  %clmul = call <vscale x 16 x i64> @llvm.clmul.nxv16i64(<vscale x 16 x i64> %va.ext, <vscale x 16 x i64> %vb.ext)
+  %res.ext = lshr <vscale x 16 x i64> %clmul, splat(i64 32)
+  %res = trunc <vscale x 16 x i64> %res.ext to <vscale x 16 x i32>
+  ret <vscale x 16 x i32> %res
+}
+
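+; Same widened-clmulh pattern as the vv test above, but the second operand is
+; splatted from a scalar GPR (see the vmv.v.x in the RV64 checks).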
+define <vscale x 16 x i32> @clmulh_nxv16i32_vx(<vscale x 16 x i32> %va, i32 %b) nounwind {
+; RV32-LABEL: clmulh_nxv16i32_vx:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 8
+; RV32-NEXT:    lui a4, 16
+; RV32-NEXT:    vsrl.vi v24, v8, 24
+; RV32-NEXT:    vsll.vi v0, v8, 24
+; RV32-NEXT:    lui a2, 61681
+; RV32-NEXT:    lui a5, 209715
+; RV32-NEXT:    lui a7, 349525
+; RV32-NEXT:    srli a3, a0, 8
+; RV32-NEXT:    srli a6, a0, 24
+; RV32-NEXT:    addi a1, a4, -256
+; RV32-NEXT:    and a3, a3, a1
+; RV32-NEXT:    or t0, a3, a6
+; RV32-NEXT:    slli a3, a0, 24
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 8
+; RV32-NEXT:    or t1, a3, a0
+; RV32-NEXT:    li a6, 1
+; RV32-NEXT:    addi a3, a2, -241
+; RV32-NEXT:    addi a2, a5, 819
+; RV32-NEXT:    addi a0, a7, 1365
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vand.vx v8, v8, a1
+; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vsll.vi v8, v8, 8
+; RV32-NEXT:    vor.vv v8, v0, v8
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    or a5, t1, t0
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vand.vx v8, v8, a3
+; RV32-NEXT:    srli a7, a5, 4
+; RV32-NEXT:    and a5, a5, a3
+; RV32-NEXT:    vand.vx v16, v16, a3
+; RV32-NEXT:    vsll.vi v8, v8, 4
+; RV32-NEXT:    and a7, a7, a3
+; RV32-NEXT:    slli a5, a5, 4
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    or a5, a7, a5
+; RV32-NEXT:    vsrl.vi v16, v8, 2
+; RV32-NEXT:    vand.vx v8, v8, a2
+; RV32-NEXT:    srli a7, a5, 2
+; RV32-NEXT:    and a5, a5, a2
+; RV32-NEXT:    vand.vx v16, v16, a2
+; RV32-NEXT:    vsll.vi v8, v8, 2
+; RV32-NEXT:    and a7, a7, a2
+; RV32-NEXT:    slli a5, a5, 2
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    or a5, a7, a5
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vx v8, v8, a0
+; RV32-NEXT:    srli a7, a5, 1
+; RV32-NEXT:    and a5, a5, a0
+; RV32-NEXT:    vand.vx v16, v16, a0
+; RV32-NEXT:    vadd.vv v8, v8, v8
+; RV32-NEXT:    and a7, a7, a0
+; RV32-NEXT:    slli a5, a5, 1
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    or a5, a7, a5
+; RV32-NEXT:    andi a7, a5, 2
+; RV32-NEXT:    vmul.vx v16, v8, a7
+; RV32-NEXT:    andi a7, a5, 1
+; RV32-NEXT:    vmul.vx v24, v8, a7
+; RV32-NEXT:    andi a7, a5, 4
+; RV32-NEXT:    vmul.vx v0, v8, a7
+; RV32-NEXT:    andi a7, a5, 8
+; RV32-NEXT:    vxor.vv v16, v24, v16
+; RV32-NEXT:    vmul.vx v24, v8, a7
+; RV32-NEXT:    andi a7, a5, 16
+; RV32-NEXT:    vxor.vv v16, v16, v0
+; RV32-NEXT:    vmul.vx v0, v8, a7
+; RV32-NEXT:    andi a7, a5, 32
+; RV32-NEXT:    vxor.vv v16, v16, v24
+; RV32-NEXT:    vmul.vx v24, v8, a7
+; RV32-NEXT:    andi a7, a5, 64
+; RV32-NEXT:    vxor.vv v16, v16, v0
+; RV32-NEXT:    vmul.vx v0, v8, a7
+; RV32-NEXT:    andi a7, a5, 128
+; RV32-NEXT:    vxor.vv v16, v16, v24
+; RV32-NEXT:    vmul.vx v24, v8, a7
+; RV32-NEXT:    andi a7, a5, 256
+; RV32-NEXT:    vxor.vv v16, v16, v0
+; RV32-NEXT:    vmul.vx v0, v8, a7
+; RV32-NEXT:    andi a7, a5, 512
+; RV32-NEXT:    vxor.vv v16, v16, v24
+; RV32-NEXT:    csrr t0, vlenb
+; RV32-NEXT:    slli t0, t0, 3
+; RV32-NEXT:    add t0, sp, t0
+; RV32-NEXT:    addi t0, t0, 16
+; RV32-NEXT:    vs8r.v v16, (t0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vmul.vx v24, v8, a7
+; RV32-NEXT:    andi a7, a5, 1024
+; RV32-NEXT:    vxor.vv v0, v16, v0
+; RV32-NEXT:    vxor.vv v24, v0, v24
+; RV32-NEXT:    vmul.vx v0, v8, a7
+; RV32-NEXT:    lui a7, 1
+; RV32-NEXT:    slli a6, a6, 11
+; RV32-NEXT:    and a6, a5, a6
+; RV32-NEXT:    vxor.vv v24, v24, v0
+; RV32-NEXT:    vmul.vx v0, v8, a6
+; RV32-NEXT:    lui a6, 2
+; RV32-NEXT:    and a7, a5, a7
+; RV32-NEXT:    vxor.vv v24, v24, v0
+; RV32-NEXT:    vmul.vx v0, v8, a7
+; RV32-NEXT:    lui a7, 4
+; RV32-NEXT:    and a6, a5, a6
+; RV32-NEXT:    vxor.vv v24, v24, v0
+; RV32-NEXT:    vmul.vx v0, v8, a6
+; RV32-NEXT:    lui a6, 8
+; RV32-NEXT:    and a7, a5, a7
+; RV32-NEXT:    vxor.vv v24, v24, v0
+; RV32-NEXT:    vmul.vx v0, v8, a7
+; RV32-NEXT:    lui a7, 32
+; RV32-NEXT:    and a6, a5, a6
+; RV32-NEXT:    vxor.vv v24, v24, v0
+; RV32-NEXT:    vmul.vx v0, v8, a6
+; RV32-NEXT:    lui a6, 64
+; RV32-NEXT:    and a4, a5, a4
+; RV32-NEXT:    vxor.vv v24, v24, v0
+; RV32-NEXT:    vmul.vx v0, v8, a4
+; RV32-NEXT:    lui a4, 128
+; RV32-NEXT:    and a7, a5, a7
+; RV32-NEXT:    vxor.vv v24, v24, v0
+; RV32-NEXT:    vmul.vx v0, v8, a7
+; RV32-NEXT:    lui a7, 256
+; RV32-NEXT:    and a6, a5, a6
+; RV32-NEXT:    vxor.vv v24, v24, v0
+; RV32-NEXT:    vmul.vx v0, v8, a6
+; RV32-NEXT:    lui a6, 512
+; RV32-NEXT:    and a4, a5, a4
+; RV32-NEXT:    vxor.vv v24, v24, v0
+; RV32-NEXT:    vmul.vx v0, v8, a4
+; RV32-NEXT:    lui a4, 1024
+; RV32-NEXT:    and a7, a5, a7
+; RV32-NEXT:    vxor.vv v24, v24, v0
+; RV32-NEXT:    vmul.vx v0, v8, a7
+; RV32-NEXT:    lui a7, 2048
+; RV32-NEXT:    and a6, a5, a6
+; RV32-NEXT:    vxor.vv v24, v24, v0
+; RV32-NEXT:    vmul.vx v0, v8, a6
+; RV32-NEXT:    lui a6, 4096
+; RV32-NEXT:    and a4, a5, a4
+; RV32-NEXT:    vxor.vv v24, v24, v0
+; RV32-NEXT:    vmul.vx v0, v8, a4
+; RV32-NEXT:    lui a4, 8192
+; RV32-NEXT:    and a7, a5, a7
+; RV32-NEXT:    and a6, a5, a6
+; RV32-NEXT:    and a4, a5, a4
+; RV32-NEXT:    vxor.vv v24, v24, v0
+; RV32-NEXT:    vmul.vx v0, v8, a7
+; RV32-NEXT:    vxor.vv v24, v24, v0
+; RV32-NEXT:    vmul.vx v0, v8, a6
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    vmul.vx v16, v8, a4
+; RV32-NEXT:    vxor.vv v16, v0, v16
+; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsll.vi v16, v16, 24
+; RV32-NEXT:    vand.vx v0, v24, a1
+; RV32-NEXT:    vsll.vi v0, v0, 8
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    lui a4, 16384
+; RV32-NEXT:    and a4, a5, a4
+; RV32-NEXT:    vmul.vx v0, v8, a4
+; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vxor.vv v0, v16, v0
+; RV32-NEXT:    lui a4, 32768
+; RV32-NEXT:    and a4, a5, a4
+; RV32-NEXT:    vmul.vx v16, v8, a4
+; RV32-NEXT:    vxor.vv v16, v0, v16
+; RV32-NEXT:    lui a4, 65536
+; RV32-NEXT:    and a4, a5, a4
+; RV32-NEXT:    vmul.vx v0, v8, a4
+; RV32-NEXT:    vxor.vv v16, v16, v0
+; RV32-NEXT:    lui a4, 131072
+; RV32-NEXT:    and a4, a5, a4
+; RV32-NEXT:    vmul.vx v0, v8, a4
+; RV32-NEXT:    vxor.vv v16, v16, v0
+; RV32-NEXT:    lui a4, 262144
+; RV32-NEXT:    and a4, a5, a4
+; RV32-NEXT:    vmul.vx v0, v8, a4
+; RV32-NEXT:    vxor.vv v16, v16, v0
+; RV32-NEXT:    lui a4, 524288
+; RV32-NEXT:    and a4, a5, a4
+; RV32-NEXT:    vmul.vx v8, v8, a4
+; RV32-NEXT:    vxor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v24, 8
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vsrl.vi v8, v8, 24
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vand.vx v8, v8, a3
+; RV32-NEXT:    vand.vx v16, v16, a3
+; RV32-NEXT:    vsll.vi v8, v8, 4
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 2
+; RV32-NEXT:    vand.vx v8, v8, a2
+; RV32-NEXT:    vand.vx v16, v16, a2
+; RV32-NEXT:    vsll.vi v8, v8, 2
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vx v8, v8, a0
+; RV32-NEXT:    vand.vx v16, v16, a0
+; RV32-NEXT:    vadd.vv v8, v8, v8
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v8, v8, 1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: clmulh_nxv16i32_vx:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -144
+; RV64-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 128(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 120(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s3, 104(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s4, 96(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s5, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s6, 80(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s7, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s8, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s9, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s10, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s11, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v0, a0
+; RV64-NEXT:    vsrl.vi v16, v8, 8
+; RV64-NEXT:    lui a0, 16
+; RV64-NEXT:    vsrl.vi v24, v8, 24
+; RV64-NEXT:    addi a0, a0, -256
+; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    add a2, a2, a1
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vsrl.vi v24, v0, 8
+; RV64-NEXT:    vand.vx v24, v24, a0
+; RV64-NEXT:    vsrl.vi v16, v0, 24
+; RV64-NEXT:    vor.vv v24, v24, v16
+; RV64-NEXT:    vsll.vi v16, v8, 24
+; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vsll.vi v8, v8, 8
+; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsll.vi v16, v0, 24
+; RV64-NEXT:    vand.vx v0, v0, a0
+; RV64-NEXT:    vsll.vi v0, v0, 8
+; RV64-NEXT:    vor.vv v0, v16, v0
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui t6, 349525
+; RV64-NEXT:    li s5, 16
+; RV64-NEXT:    li t2, 32
+; RV64-NEXT:    li a7, 256
+; RV64-NEXT:    li t0, 512
+; RV64-NEXT:    li t1, 1024
+; RV64-NEXT:    li s6, 1
+; RV64-NEXT:    lui t3, 1
+; RV64-NEXT:    lui t4, 2
+; RV64-NEXT:    lui t5, 4
+; RV64-NEXT:    lui s0, 8
+; RV64-NEXT:    lui s1, 32
+; RV64-NEXT:    lui s2, 64
+; RV64-NEXT:    lui s3, 128
+; RV64-NEXT:    lui s4, 256
+; RV64-NEXT:    lui s7, 512
+; RV64-NEXT:    lui s8, 1024
+; RV64-NEXT:    lui s9, 2048
+; RV64-NEXT:    lui s10, 4096
+; RV64-NEXT:    lui s11, 8192
+; RV64-NEXT:    lui ra, 16384
+; RV64-NEXT:    lui a5, 32768
+; RV64-NEXT:    addi a3, a1, -241
+; RV64-NEXT:    addi a2, a2, 819
+; RV64-NEXT:    addi a1, t6, 1365
+; RV64-NEXT:    vor.vv v16, v0, v24
+; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vand.vx v24, v24, a3
+; RV64-NEXT:    vsll.vi v16, v16, 4
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vsrl.vi v24, v16, 2
+; RV64-NEXT:    vand.vx v16, v16, a2
+; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    vsll.vi v16, v16, 2
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vsrl.vi v24, v16, 1
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vadd.vv v16, v16, v16
+; RV64-NEXT:    vor.vv v0, v24, v16
+; RV64-NEXT:    vand.vx v16, v0, s5
+; RV64-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    mv a4, a0
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    lui t6, 65536
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    mv a4, a0
+; RV64-NEXT:    slli a0, a0, 1
+; RV64-NEXT:    add a4, a4, a0
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vsll.vi v8, v8, 4
+; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v16, v16, a2
+; RV64-NEXT:    vsll.vi v8, v8, 2
+; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vadd.vv v8, v8, v8
+; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vand.vx v16, v0, t2
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    mv a4, a0
+; RV64-NEXT:    slli a0, a0, 1
+; RV64-NEXT:    add a4, a4, a0
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    lui a4, 131072
+; RV64-NEXT:    slli s6, s6, 11
+; RV64-NEXT:    lui t2, 262144
+; RV64-NEXT:    lui s5, 524288
+; RV64-NEXT:    li a6, 64
+; RV64-NEXT:    vand.vx v16, v0, a6
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a0, a6
+; RV64-NEXT:    slli a6, a6, 5
+; RV64-NEXT:    add a6, a6, a0
+; RV64-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    li a6, 128
+; RV64-NEXT:    vand.vx v16, v0, a6
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 8
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, a7
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, t0
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 4
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, t1
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 2
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s6
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 5
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, t3
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 2
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, t4
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 4
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 2
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, t5
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s0
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 6
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    lui a6, 16
+; RV64-NEXT:    vand.vx v16, v0, a6
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 2
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s1
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 4
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 2
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s2
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 2
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 2
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s3
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 5
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 2
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s4
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s7
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 4
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s8
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 4
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s9
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s10
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s11
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 4
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, ra
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    mv a7, a6
+; RV64-NEXT:    slli a6, a6, 2
+; RV64-NEXT:    add a7, a7, a6
+; RV64-NEXT:    slli a6, a6, 1
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vs8r.v v16, (a6) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, a5
+; RV64-NEXT:    csrr a5, vlenb
+; RV64-NEXT:    slli a5, a5, 5
+; RV64-NEXT:    mv a6, a5
+; RV64-NEXT:    slli a5, a5, 1
+; RV64-NEXT:    add a5, a5, a6
+; RV64-NEXT:    add a5, sp, a5
+; RV64-NEXT:    addi a5, a5, 32
+; RV64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, t6
+; RV64-NEXT:    csrr a5, vlenb
+; RV64-NEXT:    slli a5, a5, 3
+; RV64-NEXT:    mv a6, a5
+; RV64-NEXT:    slli a5, a5, 1
+; RV64-NEXT:    add a6, a6, a5
+; RV64-NEXT:    slli a5, a5, 2
+; RV64-NEXT:    add a5, a5, a6
+; RV64-NEXT:    add a5, sp, a5
+; RV64-NEXT:    addi a5, a5, 32
+; RV64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, a4
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vi v16, v0, 2
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vi v24, v0, 1
+; RV64-NEXT:    vand.vi v16, v0, 4
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vi v16, v0, 8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, t2
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 6
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vand.vx v16, v0, s5
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vmul.vv v16, v8, v24
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v24, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v0, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 8
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    addi a4, sp, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 6
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 6
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 7
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 7
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 8
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 6
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v16, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vmul.vv v8, v8, v16
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v16, v16, v8
+; RV64-NEXT:    vxor.vv v16, v16, v24
+; RV64-NEXT:    vxor.vv v16, v16, v0
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v16, v16, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v16, v16, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v16, v16, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v16, v16, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v16, v8
+; RV64-NEXT:    addi a4, sp, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 6
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 7
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v24, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v0, v24, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v0, v0, v8
+; RV64-NEXT:    vsll.vi v16, v16, 24
+; RV64-NEXT:    vand.vx v8, v24, a0
+; RV64-NEXT:    vsll.vi v8, v8, 8
+; RV64-NEXT:    vor.vv v16, v16, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v8, v0, v8
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v8, v8, v0
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v8, v8, v0
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 8
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v8, v8, v0
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v8, v8, v0
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a5, a5, a4
+; RV64-NEXT:    slli a4, a4, 1
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 32
+; RV64-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vxor.vv v8, v8, v0
+; RV64-NEXT:    vsrl.vi v24, v24, 8
+; RV64-NEXT:    vand.vx v24, v24, a0
+; RV64-NEXT:    vsrl.vi v8, v8, 24
+; RV64-NEXT:    vor.vv v8, v24, v8
+; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vsll.vi v8, v8, 4
+; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v16, v16, a2
+; RV64-NEXT:    vsll.vi v8, v8, 2
+; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vadd.vv v8, v8, v8
+; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v8, v8, 1
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    ld ra, 136(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 128(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 120(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s3, 104(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s4, 96(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s5, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s6, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s7, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s8, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s9, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s10, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s11, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 144
+; RV64-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i64 0
+  %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+  %va.ext = zext <vscale x 16 x i32> %va to <vscale x 16 x i64>
+  %vb.ext = zext <vscale x 16 x i32> %vb to <vscale x 16 x i64>
+  %clmul = call <vscale x 16 x i64> @llvm.clmul.nxv16i64(<vscale x 16 x i64> %va.ext, <vscale x 16 x i64> %vb.ext)
+  %res.ext = lshr <vscale x 16 x i64> %clmul, splat(i64 32)
+  %res = trunc <vscale x 16 x i64> %res.ext to <vscale x 16 x i32>
+  ret <vscale x 16 x i32> %res
+}
+
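+; A minimal sketch of the pattern the clmulh tests above exercise, assuming
+; the overloaded @llvm.clmul intrinsic used throughout this file: a 32-bit
+; carry-less multiply-high is formed by zero-extending both operands to i64,
+; doing one 64-bit clmul, and keeping the upper 32 bits.
+;
+;   %x.ext = zext i32 %x to i64
+;   %y.ext = zext i32 %y to i64
+;   %m = call i64 @llvm.clmul.i64(i64 %x.ext, i64 %y.ext)
+;   %hi = lshr i64 %m, 32
+;   %h = trunc i64 %hi to i32
+;
+; Without Zvbc, legalization has no clmul instruction to fall back on, which
+; is what produces the long per-bit vand/vmul/vxor chains checked above.
+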
+define <vscale x 1 x i64> @clmulh_nxv1i64_vv(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb) nounwind {
+; RV32V-LABEL: clmulh_nxv1i64_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -352
+; RV32V-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v0, v0, v16
-; RV32V-NEXT:    vor.vv v24, v0, v24
-; RV32V-NEXT:    vand.vv v0, v8, v16
-; RV32V-NEXT:    vsll.vi v0, v0, 8
-; RV32V-NEXT:    vand.vx v16, v8, a3
+; RV32V-NEXT:    sub sp, sp, a0
+; RV32V-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV32V-NEXT:    vmv1r.v v10, v8
+; RV32V-NEXT:    lui s7, 1044480
+; RV32V-NEXT:    lui s1, 524288
+; RV32V-NEXT:    li s11, 1
+; RV32V-NEXT:    li s6, 2
+; RV32V-NEXT:    li s5, 4
+; RV32V-NEXT:    li s10, 8
+; RV32V-NEXT:    li s9, 64
+; RV32V-NEXT:    li s8, 128
+; RV32V-NEXT:    li s4, 256
+; RV32V-NEXT:    li s3, 512
+; RV32V-NEXT:    li s2, 1024
+; RV32V-NEXT:    lui s0, 1
+; RV32V-NEXT:    lui t6, 2
+; RV32V-NEXT:    lui t5, 4
+; RV32V-NEXT:    lui t4, 8
+; RV32V-NEXT:    lui t3, 16
+; RV32V-NEXT:    lui t2, 32
+; RV32V-NEXT:    lui t1, 64
+; RV32V-NEXT:    lui t0, 128
+; RV32V-NEXT:    lui a7, 256
+; RV32V-NEXT:    lui a5, 512
+; RV32V-NEXT:    lui a4, 1024
+; RV32V-NEXT:    lui a2, 2048
+; RV32V-NEXT:    lui a1, 4096
+; RV32V-NEXT:    lui a0, 8192
+; RV32V-NEXT:    lui a3, 16384
+; RV32V-NEXT:    sw s7, 248(sp)
+; RV32V-NEXT:    lui ra, 32768
+; RV32V-NEXT:    sw zero, 252(sp)
+; RV32V-NEXT:    sw s1, 8(sp)
+; RV32V-NEXT:    lui a6, 524288
+; RV32V-NEXT:    sw zero, 12(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw s11, 276(sp)
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw s6, 260(sp)
+; RV32V-NEXT:    lui s6, 65536
+; RV32V-NEXT:    sw zero, 264(sp)
+; RV32V-NEXT:    sw s5, 268(sp)
+; RV32V-NEXT:    lui s7, 131072
+; RV32V-NEXT:    sw zero, 240(sp)
+; RV32V-NEXT:    sw s10, 244(sp)
+; RV32V-NEXT:    lui s5, 262144
+; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    li s1, 16
+; RV32V-NEXT:    sw s1, 236(sp)
+; RV32V-NEXT:    li s1, 16
+; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    li s10, 32
+; RV32V-NEXT:    sw s10, 228(sp)
+; RV32V-NEXT:    sw zero, 216(sp)
+; RV32V-NEXT:    sw s9, 220(sp)
+; RV32V-NEXT:    li s10, 64
+; RV32V-NEXT:    sw zero, 208(sp)
+; RV32V-NEXT:    sw s8, 212(sp)
+; RV32V-NEXT:    sw zero, 200(sp)
+; RV32V-NEXT:    sw s4, 204(sp)
+; RV32V-NEXT:    sw zero, 192(sp)
+; RV32V-NEXT:    sw s3, 196(sp)
+; RV32V-NEXT:    li s9, 512
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw s2, 188(sp)
+; RV32V-NEXT:    li s2, 1024
+; RV32V-NEXT:    slli s11, s11, 11
+; RV32V-NEXT:    sw zero, 176(sp)
+; RV32V-NEXT:    sw s11, 180(sp)
+; RV32V-NEXT:    sw zero, 168(sp)
+; RV32V-NEXT:    sw s0, 172(sp)
+; RV32V-NEXT:    sw zero, 160(sp)
+; RV32V-NEXT:    sw t6, 164(sp)
+; RV32V-NEXT:    sw zero, 152(sp)
+; RV32V-NEXT:    sw t5, 156(sp)
+; RV32V-NEXT:    lui s8, 4
+; RV32V-NEXT:    sw zero, 144(sp)
+; RV32V-NEXT:    sw t4, 148(sp)
+; RV32V-NEXT:    lui t5, 8
+; RV32V-NEXT:    sw zero, 136(sp)
+; RV32V-NEXT:    sw t3, 140(sp)
+; RV32V-NEXT:    lui s0, 16
+; RV32V-NEXT:    sw zero, 128(sp)
+; RV32V-NEXT:    sw t2, 132(sp)
+; RV32V-NEXT:    sw zero, 120(sp)
+; RV32V-NEXT:    sw t1, 124(sp)
+; RV32V-NEXT:    lui t3, 64
+; RV32V-NEXT:    sw zero, 112(sp)
+; RV32V-NEXT:    sw t0, 116(sp)
+; RV32V-NEXT:    lui s4, 128
+; RV32V-NEXT:    sw zero, 104(sp)
+; RV32V-NEXT:    sw a7, 108(sp)
+; RV32V-NEXT:    lui t4, 256
+; RV32V-NEXT:    sw zero, 96(sp)
+; RV32V-NEXT:    sw a5, 100(sp)
+; RV32V-NEXT:    lui t0, 512
+; RV32V-NEXT:    sw zero, 88(sp)
+; RV32V-NEXT:    sw a4, 92(sp)
+; RV32V-NEXT:    lui a5, 1024
+; RV32V-NEXT:    sw zero, 80(sp)
+; RV32V-NEXT:    sw a2, 84(sp)
+; RV32V-NEXT:    lui a4, 2048
+; RV32V-NEXT:    sw zero, 72(sp)
+; RV32V-NEXT:    sw a1, 76(sp)
+; RV32V-NEXT:    lui t1, 4096
+; RV32V-NEXT:    sw zero, 64(sp)
+; RV32V-NEXT:    sw a0, 68(sp)
+; RV32V-NEXT:    lui t6, 8192
+; RV32V-NEXT:    sw zero, 56(sp)
+; RV32V-NEXT:    sw a3, 60(sp)
+; RV32V-NEXT:    lui s3, 16384
+; RV32V-NEXT:    sw zero, 48(sp)
+; RV32V-NEXT:    sw ra, 52(sp)
+; RV32V-NEXT:    lui a7, 32768
+; RV32V-NEXT:    sw zero, 40(sp)
+; RV32V-NEXT:    sw s6, 44(sp)
+; RV32V-NEXT:    sw zero, 32(sp)
+; RV32V-NEXT:    sw s7, 36(sp)
+; RV32V-NEXT:    sw zero, 24(sp)
+; RV32V-NEXT:    sw s5, 28(sp)
+; RV32V-NEXT:    sw zero, 16(sp)
+; RV32V-NEXT:    sw a6, 20(sp)
+; RV32V-NEXT:    lui a0, 61681
+; RV32V-NEXT:    addi a0, a0, -241
+; RV32V-NEXT:    vmv.v.x v14, a0
+; RV32V-NEXT:    lui a0, 209715
+; RV32V-NEXT:    addi a0, a0, 819
+; RV32V-NEXT:    vmv.v.x v11, a0
+; RV32V-NEXT:    lui a0, 349525
+; RV32V-NEXT:    addi a0, a0, 1365
+; RV32V-NEXT:    vmv.v.x v12, a0
+; RV32V-NEXT:    addi a0, sp, 248
+; RV32V-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32V-NEXT:    vlse64.v v13, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 8
+; RV32V-NEXT:    vlse64.v v7, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 272
+; RV32V-NEXT:    vlse64.v v31, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 256
+; RV32V-NEXT:    vlse64.v v30, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 264
+; RV32V-NEXT:    vlse64.v v29, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 240
+; RV32V-NEXT:    vlse64.v v26, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 232
+; RV32V-NEXT:    vlse64.v v23, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 224
+; RV32V-NEXT:    vlse64.v v20, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 216
+; RV32V-NEXT:    vlse64.v v17, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 208
+; RV32V-NEXT:    vlse64.v v15, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 200
+; RV32V-NEXT:    vlse64.v v8, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 288
+; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 192
+; RV32V-NEXT:    vlse64.v v18, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 184
+; RV32V-NEXT:    vlse64.v v16, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 176
+; RV32V-NEXT:    vlse64.v v19, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 168
+; RV32V-NEXT:    vlse64.v v21, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 160
+; RV32V-NEXT:    vlse64.v v22, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 152
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 144
+; RV32V-NEXT:    vlse64.v v28, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 136
+; RV32V-NEXT:    vlse64.v v25, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 128
+; RV32V-NEXT:    vlse64.v v27, (a0), zero
+; RV32V-NEXT:    li ra, 56
+; RV32V-NEXT:    vsrl.vi v6, v10, 24
+; RV32V-NEXT:    vsrl.vi v5, v10, 8
+; RV32V-NEXT:    vsrl.vx v4, v10, ra
+; RV32V-NEXT:    li a2, 40
+; RV32V-NEXT:    vsrl.vx v3, v10, a2
+; RV32V-NEXT:    vsll.vx v2, v10, ra
+; RV32V-NEXT:    vsrl.vx v1, v9, ra
+; RV32V-NEXT:    vsrl.vx v0, v9, a2
+; RV32V-NEXT:    addi a0, s0, -256
+; RV32V-NEXT:    vand.vx v3, v3, a0
+; RV32V-NEXT:    vor.vv v3, v3, v4
+; RV32V-NEXT:    vsll.vx v4, v9, ra
+; RV32V-NEXT:    vand.vx v0, v0, a0
+; RV32V-NEXT:    vor.vv v1, v0, v1
+; RV32V-NEXT:    vand.vx v0, v10, a0
+; RV32V-NEXT:    vsll.vx v0, v0, a2
+; RV32V-NEXT:    vor.vv v2, v2, v0
+; RV32V-NEXT:    vand.vx v0, v9, a0
+; RV32V-NEXT:    vsll.vx v0, v0, a2
+; RV32V-NEXT:    vor.vv v4, v4, v0
+; RV32V-NEXT:    vsrl.vi v0, v9, 24
+; RV32V-NEXT:    lui a1, 4080
+; RV32V-NEXT:    vand.vx v6, v6, a1
+; RV32V-NEXT:    vand.vv v5, v5, v13
+; RV32V-NEXT:    vor.vv v8, v5, v6
+; RV32V-NEXT:    vsrl.vi v6, v9, 8
+; RV32V-NEXT:    vand.vx v5, v0, a1
+; RV32V-NEXT:    vand.vv v6, v6, v13
+; RV32V-NEXT:    vor.vv v5, v6, v5
+; RV32V-NEXT:    addi a3, sp, 120
+; RV32V-NEXT:    vlse64.v v6, (a3), zero
+; RV32V-NEXT:    vor.vv v1, v5, v1
+; RV32V-NEXT:    vand.vx v5, v9, a1
+; RV32V-NEXT:    vsll.vi v5, v5, 24
+; RV32V-NEXT:    vand.vv v9, v9, v13
+; RV32V-NEXT:    vsll.vi v9, v9, 8
+; RV32V-NEXT:    vor.vv v9, v5, v9
+; RV32V-NEXT:    addi a3, sp, 112
+; RV32V-NEXT:    vlse64.v v5, (a3), zero
+; RV32V-NEXT:    vor.vv v9, v4, v9
+; RV32V-NEXT:    addi a3, sp, 104
+; RV32V-NEXT:    vlse64.v v4, (a3), zero
+; RV32V-NEXT:    vor.vv v9, v9, v1
+; RV32V-NEXT:    vsrl.vi v1, v9, 4
+; RV32V-NEXT:    vand.vv v9, v9, v14
+; RV32V-NEXT:    vand.vv v1, v1, v14
+; RV32V-NEXT:    vsll.vi v9, v9, 4
+; RV32V-NEXT:    vor.vv v9, v1, v9
+; RV32V-NEXT:    vsrl.vi v1, v9, 2
+; RV32V-NEXT:    vand.vv v9, v9, v11
+; RV32V-NEXT:    vand.vv v1, v1, v11
+; RV32V-NEXT:    vsll.vi v9, v9, 2
+; RV32V-NEXT:    vor.vv v9, v1, v9
+; RV32V-NEXT:    vsrl.vi v1, v9, 1
+; RV32V-NEXT:    vand.vv v9, v9, v12
+; RV32V-NEXT:    vand.vv v1, v1, v12
+; RV32V-NEXT:    vadd.vv v9, v9, v9
+; RV32V-NEXT:    vor.vv v9, v1, v9
+; RV32V-NEXT:    vand.vx v1, v9, s1
+; RV32V-NEXT:    vor.vv v8, v8, v3
+; RV32V-NEXT:    vand.vx v3, v10, a1
+; RV32V-NEXT:    vsll.vi v3, v3, 24
+; RV32V-NEXT:    vand.vv v10, v10, v13
+; RV32V-NEXT:    vsll.vi v10, v10, 8
+; RV32V-NEXT:    vor.vv v10, v3, v10
+; RV32V-NEXT:    li a3, 32
+; RV32V-NEXT:    vand.vx v3, v9, a3
+; RV32V-NEXT:    vor.vv v10, v2, v10
+; RV32V-NEXT:    vor.vv v8, v10, v8
+; RV32V-NEXT:    vsrl.vi v10, v8, 4
+; RV32V-NEXT:    vand.vv v8, v8, v14
+; RV32V-NEXT:    vand.vv v10, v10, v14
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v10, v8
+; RV32V-NEXT:    vsrl.vi v10, v8, 2
+; RV32V-NEXT:    vand.vv v8, v8, v11
+; RV32V-NEXT:    vand.vv v10, v10, v11
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v10, v8
+; RV32V-NEXT:    vsrl.vi v10, v8, 1
+; RV32V-NEXT:    vand.vv v8, v8, v12
+; RV32V-NEXT:    vand.vv v10, v10, v12
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v10, v8
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v2, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v10, v2, v10
+; RV32V-NEXT:    vand.vi v2, v9, 4
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v10, v10, v2
+; RV32V-NEXT:    vand.vi v2, v9, 8
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v10, v10, v2
+; RV32V-NEXT:    vand.vx v2, v9, s10
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v10, v10, v1
+; RV32V-NEXT:    li a3, 128
+; RV32V-NEXT:    vand.vx v1, v9, a3
+; RV32V-NEXT:    vmul.vv v3, v8, v3
+; RV32V-NEXT:    vxor.vv v10, v10, v3
+; RV32V-NEXT:    li a3, 256
+; RV32V-NEXT:    vand.vx v0, v9, a3
+; RV32V-NEXT:    vmul.vv v3, v8, v2
+; RV32V-NEXT:    vxor.vv v10, v10, v3
+; RV32V-NEXT:    vand.vx v2, v9, s9
+; RV32V-NEXT:    vmul.vv v3, v8, v1
+; RV32V-NEXT:    vxor.vv v3, v10, v3
+; RV32V-NEXT:    vand.vx v10, v9, s2
+; RV32V-NEXT:    vmul.vv v1, v8, v0
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v1, v3, v1
+; RV32V-NEXT:    vxor.vv v2, v1, v2
+; RV32V-NEXT:    vand.vx v1, v9, s11
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vxor.vv v10, v2, v10
+; RV32V-NEXT:    lui a3, 1
+; RV32V-NEXT:    vand.vx v2, v9, a3
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v10, v10, v1
+; RV32V-NEXT:    lui a3, 2
+; RV32V-NEXT:    vand.vx v1, v9, a3
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v10, v10, v2
+; RV32V-NEXT:    vand.vx v2, v9, s8
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v10, v10, v1
+; RV32V-NEXT:    vand.vx v1, v9, t5
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v10, v10, v2
+; RV32V-NEXT:    vand.vx v2, v9, s0
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v10, v10, v1
+; RV32V-NEXT:    vand.vx v1, v9, t2
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v10, v10, v2
+; RV32V-NEXT:    vand.vx v2, v9, t3
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v10, v10, v1
+; RV32V-NEXT:    vand.vx v1, v9, s4
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v10, v10, v2
+; RV32V-NEXT:    vand.vx v2, v9, t4
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v10, v10, v1
+; RV32V-NEXT:    vand.vx v1, v9, t0
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v10, v10, v2
+; RV32V-NEXT:    vand.vx v2, v9, a5
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v10, v10, v1
+; RV32V-NEXT:    vand.vx v1, v9, a4
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v10, v10, v2
+; RV32V-NEXT:    vand.vx v0, v9, t1
+; RV32V-NEXT:    vmul.vv v2, v8, v1
+; RV32V-NEXT:    vxor.vv v2, v10, v2
+; RV32V-NEXT:    vand.vx v10, v9, t6
+; RV32V-NEXT:    vmul.vv v1, v8, v0
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vxor.vv v1, v2, v1
+; RV32V-NEXT:    vxor.vv v10, v1, v10
+; RV32V-NEXT:    vand.vx v1, v9, s3
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v10, v10, v1
+; RV32V-NEXT:    vand.vx v1, v9, a7
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v10, v10, v1
+; RV32V-NEXT:    vand.vx v1, v9, s6
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v10, v10, v1
+; RV32V-NEXT:    vand.vx v1, v9, s7
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v10, v10, v1
+; RV32V-NEXT:    vand.vx v1, v9, s5
+; RV32V-NEXT:    addi a3, sp, 96
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v10, v10, v1
+; RV32V-NEXT:    vlse64.v v1, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 88
+; RV32V-NEXT:    vand.vv v7, v9, v7
+; RV32V-NEXT:    vmul.vv v7, v8, v7
+; RV32V-NEXT:    vxor.vv v10, v10, v7
+; RV32V-NEXT:    vlse64.v v7, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 80
+; RV32V-NEXT:    vand.vv v31, v9, v31
+; RV32V-NEXT:    vmul.vv v31, v8, v31
+; RV32V-NEXT:    vxor.vv v10, v10, v31
+; RV32V-NEXT:    vlse64.v v31, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 72
+; RV32V-NEXT:    vand.vv v30, v9, v30
+; RV32V-NEXT:    vmul.vv v30, v8, v30
+; RV32V-NEXT:    vxor.vv v10, v10, v30
+; RV32V-NEXT:    vlse64.v v30, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 64
+; RV32V-NEXT:    vand.vv v29, v9, v29
+; RV32V-NEXT:    vmul.vv v29, v8, v29
+; RV32V-NEXT:    vxor.vv v10, v10, v29
+; RV32V-NEXT:    vlse64.v v29, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 56
+; RV32V-NEXT:    vand.vv v26, v9, v26
+; RV32V-NEXT:    vmul.vv v26, v8, v26
+; RV32V-NEXT:    vxor.vv v10, v10, v26
+; RV32V-NEXT:    vlse64.v v26, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 48
+; RV32V-NEXT:    vand.vv v23, v9, v23
+; RV32V-NEXT:    vmul.vv v23, v8, v23
+; RV32V-NEXT:    vxor.vv v10, v10, v23
+; RV32V-NEXT:    vlse64.v v23, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 40
+; RV32V-NEXT:    vand.vv v20, v9, v20
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v10, v10, v20
+; RV32V-NEXT:    vlse64.v v20, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 32
+; RV32V-NEXT:    vand.vv v17, v9, v17
+; RV32V-NEXT:    vmul.vv v17, v8, v17
+; RV32V-NEXT:    vxor.vv v10, v10, v17
+; RV32V-NEXT:    vlse64.v v17, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 24
+; RV32V-NEXT:    vand.vv v15, v9, v15
+; RV32V-NEXT:    vmul.vv v15, v8, v15
+; RV32V-NEXT:    vxor.vv v15, v10, v15
+; RV32V-NEXT:    vlse64.v v0, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 16
+; RV32V-NEXT:    addi a4, sp, 288
+; RV32V-NEXT:    vl1r.v v10, (a4) # vscale x 8-byte Folded Reload
+; RV32V-NEXT:    vand.vv v10, v9, v10
+; RV32V-NEXT:    vand.vv v18, v9, v18
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v18, v8, v18
+; RV32V-NEXT:    vxor.vv v10, v15, v10
+; RV32V-NEXT:    vxor.vv v10, v10, v18
+; RV32V-NEXT:    vlse64.v v18, (a3), zero
+; RV32V-NEXT:    vand.vv v16, v9, v16
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    vxor.vv v10, v10, v16
+; RV32V-NEXT:    vand.vv v16, v9, v19
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    vxor.vv v10, v10, v16
+; RV32V-NEXT:    vand.vv v16, v9, v21
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    vxor.vv v10, v10, v16
+; RV32V-NEXT:    vand.vv v16, v9, v22
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    vxor.vv v10, v10, v16
+; RV32V-NEXT:    vand.vv v16, v9, v24
+; RV32V-NEXT:    vand.vv v19, v9, v28
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    vmul.vv v19, v8, v19
+; RV32V-NEXT:    vxor.vv v10, v10, v16
+; RV32V-NEXT:    vand.vx v16, v15, a1
+; RV32V-NEXT:    vxor.vv v10, v10, v19
+; RV32V-NEXT:    vsrl.vi v19, v10, 24
+; RV32V-NEXT:    vand.vx v19, v19, a1
+; RV32V-NEXT:    vand.vv v21, v9, v25
+; RV32V-NEXT:    vmul.vv v21, v8, v21
+; RV32V-NEXT:    vxor.vv v10, v10, v21
+; RV32V-NEXT:    vand.vv v21, v9, v27
+; RV32V-NEXT:    vmul.vv v21, v8, v21
+; RV32V-NEXT:    vxor.vv v10, v10, v21
+; RV32V-NEXT:    vand.vv v21, v9, v6
+; RV32V-NEXT:    vmul.vv v21, v8, v21
+; RV32V-NEXT:    vxor.vv v10, v10, v21
+; RV32V-NEXT:    vand.vv v21, v9, v5
+; RV32V-NEXT:    vmul.vv v21, v8, v21
+; RV32V-NEXT:    vxor.vv v10, v10, v21
+; RV32V-NEXT:    vand.vv v21, v9, v4
+; RV32V-NEXT:    vmul.vv v21, v8, v21
+; RV32V-NEXT:    vxor.vv v10, v10, v21
+; RV32V-NEXT:    vand.vv v21, v9, v1
+; RV32V-NEXT:    vmul.vv v21, v8, v21
+; RV32V-NEXT:    vxor.vv v10, v10, v21
+; RV32V-NEXT:    vand.vv v21, v9, v7
+; RV32V-NEXT:    vmul.vv v21, v8, v21
+; RV32V-NEXT:    vxor.vv v10, v10, v21
+; RV32V-NEXT:    vand.vv v21, v9, v31
+; RV32V-NEXT:    vmul.vv v21, v8, v21
+; RV32V-NEXT:    vand.vx v22, v2, a0
+; RV32V-NEXT:    vsll.vx v22, v22, a2
+; RV32V-NEXT:    vxor.vv v10, v10, v21
+; RV32V-NEXT:    vsrl.vx v21, v10, a2
+; RV32V-NEXT:    vand.vx v21, v21, a0
+; RV32V-NEXT:    vand.vv v24, v9, v30
+; RV32V-NEXT:    vand.vv v25, v9, v29
+; RV32V-NEXT:    vand.vv v26, v9, v26
+; RV32V-NEXT:    vand.vv v23, v9, v23
+; RV32V-NEXT:    vand.vv v20, v9, v20
+; RV32V-NEXT:    vand.vv v17, v9, v17
+; RV32V-NEXT:    vand.vv v27, v9, v0
+; RV32V-NEXT:    vand.vv v9, v9, v18
+; RV32V-NEXT:    vmul.vv v18, v8, v24
+; RV32V-NEXT:    vmul.vv v24, v8, v25
+; RV32V-NEXT:    vmul.vv v25, v8, v26
+; RV32V-NEXT:    vmul.vv v23, v8, v23
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vmul.vv v17, v8, v17
+; RV32V-NEXT:    vmul.vv v26, v8, v27
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v9, v10, v18
+; RV32V-NEXT:    vxor.vv v9, v9, v24
+; RV32V-NEXT:    vxor.vv v9, v9, v25
+; RV32V-NEXT:    vxor.vv v9, v9, v23
+; RV32V-NEXT:    vxor.vv v9, v9, v20
+; RV32V-NEXT:    vxor.vv v9, v9, v17
+; RV32V-NEXT:    vxor.vv v9, v9, v26
+; RV32V-NEXT:    vxor.vv v8, v9, v8
+; RV32V-NEXT:    vsll.vx v9, v3, ra
+; RV32V-NEXT:    vsrl.vx v8, v8, ra
+; RV32V-NEXT:    vor.vv v9, v9, v22
+; RV32V-NEXT:    vsrl.vi v15, v15, 8
 ; RV32V-NEXT:    vsll.vi v16, v16, 24
-; RV32V-NEXT:    vor.vv v16, v16, v0
-; RV32V-NEXT:    vsll.vx v0, v8, t5
-; RV32V-NEXT:    vand.vx v8, v8, t3
-; RV32V-NEXT:    vsll.vx v8, v8, t4
-; RV32V-NEXT:    vor.vv v8, v0, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vor.vv v24, v24, v0
-; RV32V-NEXT:    vor.vv v8, v8, v16
-; RV32V-NEXT:    vor.vv v8, v8, v24
-; RV32V-NEXT:    vsrl.vi v16, v8, 4
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 9
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v24
-; RV32V-NEXT:    vand.vv v16, v16, v24
+; RV32V-NEXT:    vand.vv v15, v15, v13
+; RV32V-NEXT:    vor.vv v15, v15, v19
+; RV32V-NEXT:    vand.vv v10, v10, v13
+; RV32V-NEXT:    vsll.vi v10, v10, 8
+; RV32V-NEXT:    vor.vv v10, v16, v10
+; RV32V-NEXT:    vor.vv v9, v9, v10
+; RV32V-NEXT:    vor.vv v8, v21, v8
+; RV32V-NEXT:    vor.vv v8, v15, v8
+; RV32V-NEXT:    vor.vv v8, v9, v8
+; RV32V-NEXT:    vsrl.vi v9, v8, 4
+; RV32V-NEXT:    vand.vv v8, v8, v14
+; RV32V-NEXT:    vand.vv v9, v9, v14
 ; RV32V-NEXT:    vsll.vi v8, v8, 4
-; RV32V-NEXT:    vor.vv v8, v16, v8
-; RV32V-NEXT:    vsrl.vi v16, v8, 2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v24
-; RV32V-NEXT:    vand.vv v16, v16, v24
+; RV32V-NEXT:    vor.vv v8, v9, v8
+; RV32V-NEXT:    vsrl.vi v9, v8, 2
+; RV32V-NEXT:    vand.vv v8, v8, v11
+; RV32V-NEXT:    vand.vv v9, v9, v11
 ; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v16, v8
-; RV32V-NEXT:    vsrl.vi v16, v8, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v24
-; RV32V-NEXT:    vand.vv v16, v16, v24
+; RV32V-NEXT:    vor.vv v8, v9, v8
+; RV32V-NEXT:    vsrl.vi v9, v8, 1
+; RV32V-NEXT:    vand.vv v8, v8, v12
+; RV32V-NEXT:    vand.vv v9, v9, v12
 ; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vor.vv v8, v9, v8
 ; RV32V-NEXT:    vsrl.vi v8, v8, 1
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add sp, sp, a0
 ; RV32V-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
 ; RV32V-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
@@ -42719,1325 +10446,5747 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32V-NEXT:    addi sp, sp, 352
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmulh_nxv8i64_vv:
+; RV64V-LABEL: clmulh_nxv1i64_vv:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    addi sp, sp, -416
-; RV64V-NEXT:    sd ra, 408(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s0, 400(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s1, 392(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s2, 384(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s3, 376(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s4, 368(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s5, 360(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s6, 352(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s7, 344(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s8, 336(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s9, 328(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s10, 320(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s11, 312(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    addi sp, sp, -208
+; RV64V-NEXT:    sd ra, 200(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s0, 192(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s1, 184(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s2, 176(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s3, 168(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s4, 160(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s5, 152(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s6, 144(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s7, 136(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s8, 128(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s9, 120(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s10, 112(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s11, 104(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    sub sp, sp, a0
+; RV64V-NEXT:    li s6, 56
+; RV64V-NEXT:    lui t4, 16
+; RV64V-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64V-NEXT:    vsrl.vi v16, v8, 24
+; RV64V-NEXT:    vsrl.vi v10, v8, 8
+; RV64V-NEXT:    li t3, 255
+; RV64V-NEXT:    lui a0, 61681
+; RV64V-NEXT:    lui a1, 209715
+; RV64V-NEXT:    lui a5, 349525
+; RV64V-NEXT:    vsrl.vi v12, v9, 24
+; RV64V-NEXT:    vsrl.vi v11, v9, 8
+; RV64V-NEXT:    li ra, 16
+; RV64V-NEXT:    li s11, 32
+; RV64V-NEXT:    li s10, 64
+; RV64V-NEXT:    li s8, 128
+; RV64V-NEXT:    li s9, 256
+; RV64V-NEXT:    li a3, 512
+; RV64V-NEXT:    li a4, 1024
+; RV64V-NEXT:    li t0, 1
+; RV64V-NEXT:    lui s5, 1
+; RV64V-NEXT:    lui s4, 2
+; RV64V-NEXT:    lui s3, 4
+; RV64V-NEXT:    lui a7, 8
+; RV64V-NEXT:    lui t1, 32
+; RV64V-NEXT:    lui t2, 64
+; RV64V-NEXT:    lui s1, 128
+; RV64V-NEXT:    lui s2, 256
+; RV64V-NEXT:    addi t5, a0, -241
+; RV64V-NEXT:    addi t6, a1, 819
+; RV64V-NEXT:    addi s0, a5, 1365
+; RV64V-NEXT:    slli a0, t5, 32
+; RV64V-NEXT:    add t5, t5, a0
+; RV64V-NEXT:    slli a0, t6, 32
+; RV64V-NEXT:    add t6, t6, a0
+; RV64V-NEXT:    slli a0, s0, 32
+; RV64V-NEXT:    add s0, s0, a0
+; RV64V-NEXT:    addi t4, t4, -256
+; RV64V-NEXT:    lui a2, 16
+; RV64V-NEXT:    slli t3, t3, 24
+; RV64V-NEXT:    vsrl.vx v13, v9, s6
+; RV64V-NEXT:    li a0, 40
+; RV64V-NEXT:    vsrl.vx v14, v9, a0
+; RV64V-NEXT:    lui a1, 4080
+; RV64V-NEXT:    vand.vx v12, v12, a1
+; RV64V-NEXT:    vand.vx v15, v9, a1
+; RV64V-NEXT:    vsll.vx v17, v9, s6
+; RV64V-NEXT:    vand.vx v14, v14, t4
+; RV64V-NEXT:    vand.vx v11, v11, t3
+; RV64V-NEXT:    vsll.vi v15, v15, 24
+; RV64V-NEXT:    vand.vx v18, v9, t3
+; RV64V-NEXT:    vand.vx v9, v9, t4
+; RV64V-NEXT:    vor.vv v13, v14, v13
+; RV64V-NEXT:    vor.vv v11, v11, v12
+; RV64V-NEXT:    vsll.vi v12, v18, 8
+; RV64V-NEXT:    vsll.vx v9, v9, a0
+; RV64V-NEXT:    li a5, 40
+; RV64V-NEXT:    vor.vv v11, v11, v13
+; RV64V-NEXT:    vor.vv v12, v15, v12
+; RV64V-NEXT:    vor.vv v9, v17, v9
+; RV64V-NEXT:    vor.vv v9, v9, v12
+; RV64V-NEXT:    vor.vv v9, v9, v11
+; RV64V-NEXT:    vsrl.vi v11, v9, 4
+; RV64V-NEXT:    vand.vx v9, v9, t5
+; RV64V-NEXT:    vand.vx v11, v11, t5
+; RV64V-NEXT:    vsll.vi v9, v9, 4
+; RV64V-NEXT:    vor.vv v9, v11, v9
+; RV64V-NEXT:    vsrl.vi v11, v9, 2
+; RV64V-NEXT:    vand.vx v9, v9, t6
+; RV64V-NEXT:    vand.vx v11, v11, t6
+; RV64V-NEXT:    vsll.vi v9, v9, 2
+; RV64V-NEXT:    vor.vv v9, v11, v9
+; RV64V-NEXT:    vsrl.vi v11, v9, 1
+; RV64V-NEXT:    vand.vx v9, v9, s0
+; RV64V-NEXT:    vand.vx v11, v11, s0
+; RV64V-NEXT:    vadd.vv v9, v9, v9
+; RV64V-NEXT:    vor.vv v11, v11, v9
+; RV64V-NEXT:    vand.vx v14, v11, ra
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vand.vx v15, v11, s11
+; RV64V-NEXT:    lui a1, 8192
+; RV64V-NEXT:    vand.vx v17, v11, s10
+; RV64V-NEXT:    lui a6, 16384
+; RV64V-NEXT:    vand.vx v18, v11, s8
+; RV64V-NEXT:    lui s6, 32768
+; RV64V-NEXT:    vand.vx v19, v11, s9
+; RV64V-NEXT:    lui s7, 65536
+; RV64V-NEXT:    vand.vx v20, v11, a3
+; RV64V-NEXT:    lui s8, 131072
+; RV64V-NEXT:    vand.vx v21, v11, a4
+; RV64V-NEXT:    slli a3, t0, 11
+; RV64V-NEXT:    vand.vx v22, v11, a3
+; RV64V-NEXT:    lui s10, 262144
+; RV64V-NEXT:    li a3, 56
+; RV64V-NEXT:    vsrl.vx v4, v8, a3
+; RV64V-NEXT:    vsrl.vx v0, v8, a5
+; RV64V-NEXT:    li a5, 40
+; RV64V-NEXT:    lui a4, 4080
+; RV64V-NEXT:    vand.vx v1, v16, a4
+; RV64V-NEXT:    vand.vx v3, v8, a4
+; RV64V-NEXT:    vsll.vx v2, v8, a3
+; RV64V-NEXT:    vand.vx v23, v11, s5
+; RV64V-NEXT:    slli s11, t0, 31
+; RV64V-NEXT:    vand.vx v24, v11, s4
+; RV64V-NEXT:    slli ra, t0, 32
+; RV64V-NEXT:    vand.vx v25, v11, s3
+; RV64V-NEXT:    slli s9, t0, 33
+; RV64V-NEXT:    vand.vx v26, v11, a7
+; RV64V-NEXT:    slli a3, t0, 34
+; RV64V-NEXT:    sd a3, 80(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v27, v11, a2
+; RV64V-NEXT:    slli a2, t0, 35
+; RV64V-NEXT:    sd a2, 72(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v28, v11, t1
+; RV64V-NEXT:    slli a2, t0, 36
+; RV64V-NEXT:    sd a2, 64(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v29, v11, t2
+; RV64V-NEXT:    slli a2, t0, 37
+; RV64V-NEXT:    sd a2, 56(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v30, v11, s1
+; RV64V-NEXT:    slli a2, t0, 38
+; RV64V-NEXT:    sd a2, 48(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v31, v11, s2
+; RV64V-NEXT:    slli a2, t0, 39
+; RV64V-NEXT:    sd a2, 40(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    lui a2, 512
+; RV64V-NEXT:    vand.vx v7, v11, a2
+; RV64V-NEXT:    slli a2, t0, 40
+; RV64V-NEXT:    sd a2, 32(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    lui a2, 1024
+; RV64V-NEXT:    vand.vx v6, v11, a2
+; RV64V-NEXT:    slli a2, t0, 41
+; RV64V-NEXT:    sd a2, 24(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    lui a2, 2048
+; RV64V-NEXT:    vand.vx v9, v11, a2
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vs1r.v v9, (a2) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli s2, t0, 42
+; RV64V-NEXT:    vand.vx v9, v11, a0
+; RV64V-NEXT:    sd s2, 8(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a2, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a2, a2, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a2
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli s3, t0, 43
+; RV64V-NEXT:    vand.vx v9, v11, a1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli s4, t0, 44
+; RV64V-NEXT:    vand.vx v9, v11, a6
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli s5, t0, 45
+; RV64V-NEXT:    vand.vx v9, v11, s6
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli s6, t0, 46
+; RV64V-NEXT:    vand.vx v9, v0, t4
+; RV64V-NEXT:    vor.vv v9, v9, v4
+; RV64V-NEXT:    vand.vx v12, v11, s7
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli s7, t0, 47
+; RV64V-NEXT:    vand.vx v10, v10, t3
+; RV64V-NEXT:    vor.vv v10, v10, v1
+; RV64V-NEXT:    vand.vx v12, v11, s8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli s8, t0, 48
+; RV64V-NEXT:    vsll.vi v12, v3, 24
+; RV64V-NEXT:    vor.vv v9, v10, v9
+; RV64V-NEXT:    vand.vx v10, v8, t3
+; RV64V-NEXT:    vsll.vi v10, v10, 8
+; RV64V-NEXT:    vor.vv v10, v12, v10
+; RV64V-NEXT:    vand.vx v12, v11, s10
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli s10, t0, 49
+; RV64V-NEXT:    vand.vx v8, v8, t4
+; RV64V-NEXT:    vsll.vx v8, v8, a5
+; RV64V-NEXT:    vor.vv v8, v2, v8
+; RV64V-NEXT:    vand.vx v12, v11, s11
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli s11, t0, 50
+; RV64V-NEXT:    vor.vv v8, v8, v10
+; RV64V-NEXT:    vand.vx v10, v11, ra
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli ra, t0, 51
+; RV64V-NEXT:    vor.vv v8, v8, v9
+; RV64V-NEXT:    vsrl.vi v9, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, t5
+; RV64V-NEXT:    vand.vx v9, v9, t5
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    vsrl.vi v9, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, t6
+; RV64V-NEXT:    vand.vx v9, v9, t6
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    vsrl.vi v9, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, s0
+; RV64V-NEXT:    vand.vx v9, v9, s0
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    vand.vx v9, v11, s9
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 5
+; RV64V-NEXT:    add a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli t2, t0, 52
+; RV64V-NEXT:    slli t1, t0, 53
+; RV64V-NEXT:    slli a7, t0, 54
+; RV64V-NEXT:    slli s9, t0, 55
+; RV64V-NEXT:    slli a6, t0, 56
+; RV64V-NEXT:    slli a5, t0, 57
+; RV64V-NEXT:    slli a4, t0, 58
+; RV64V-NEXT:    slli a2, t0, 59
+; RV64V-NEXT:    slli a1, t0, 60
+; RV64V-NEXT:    slli a3, t0, 61
+; RV64V-NEXT:    slli t0, t0, 62
+; RV64V-NEXT:    li a0, -1
+; RV64V-NEXT:    slli a0, a0, 63
+; RV64V-NEXT:    ld s1, 80(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v9, v11, s1
+; RV64V-NEXT:    csrr s1, vlenb
+; RV64V-NEXT:    slli s1, s1, 5
+; RV64V-NEXT:    add s1, sp, s1
+; RV64V-NEXT:    addi s1, s1, 96
+; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    ld s1, 72(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v9, v11, s1
+; RV64V-NEXT:    csrr s1, vlenb
+; RV64V-NEXT:    slli s2, s1, 5
+; RV64V-NEXT:    sub s1, s2, s1
+; RV64V-NEXT:    add s1, sp, s1
+; RV64V-NEXT:    addi s1, s1, 96
+; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    ld s1, 64(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v9, v11, s1
+; RV64V-NEXT:    csrr s1, vlenb
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    mv s2, s1
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    add s2, s2, s1
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    add s2, s2, s1
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    add s1, s1, s2
+; RV64V-NEXT:    add s1, sp, s1
+; RV64V-NEXT:    addi s1, s1, 96
+; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    ld s1, 56(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v9, v11, s1
+; RV64V-NEXT:    csrr s1, vlenb
+; RV64V-NEXT:    mv s2, s1
+; RV64V-NEXT:    slli s1, s1, 2
+; RV64V-NEXT:    add s2, s2, s1
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    add s2, s2, s1
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    add s1, s1, s2
+; RV64V-NEXT:    add s1, sp, s1
+; RV64V-NEXT:    addi s1, s1, 96
+; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    ld s1, 48(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v9, v11, s1
+; RV64V-NEXT:    csrr s1, vlenb
+; RV64V-NEXT:    slli s1, s1, 2
+; RV64V-NEXT:    mv s2, s1
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    add s2, s2, s1
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    add s1, s1, s2
+; RV64V-NEXT:    add s1, sp, s1
+; RV64V-NEXT:    addi s1, s1, 96
+; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v9, v11, s1
+; RV64V-NEXT:    csrr s1, vlenb
+; RV64V-NEXT:    mv s2, s1
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    add s2, s2, s1
+; RV64V-NEXT:    slli s1, s1, 2
+; RV64V-NEXT:    add s2, s2, s1
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    add s1, s1, s2
+; RV64V-NEXT:    add s1, sp, s1
+; RV64V-NEXT:    addi s1, s1, 96
+; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v9, v11, s1
+; RV64V-NEXT:    csrr s1, vlenb
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    mv s2, s1
+; RV64V-NEXT:    slli s1, s1, 2
+; RV64V-NEXT:    add s2, s2, s1
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    add s1, s1, s2
+; RV64V-NEXT:    add s1, sp, s1
+; RV64V-NEXT:    addi s1, s1, 96
+; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v9, v11, s1
+; RV64V-NEXT:    csrr s1, vlenb
+; RV64V-NEXT:    mv s2, s1
+; RV64V-NEXT:    slli s1, s1, 3
+; RV64V-NEXT:    add s2, s2, s1
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    add s1, s1, s2
+; RV64V-NEXT:    ld s2, 8(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    add s1, sp, s1
+; RV64V-NEXT:    addi s1, s1, 96
+; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, s2
+; RV64V-NEXT:    csrr s1, vlenb
+; RV64V-NEXT:    slli s1, s1, 3
+; RV64V-NEXT:    mv s2, s1
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    add s1, s1, s2
+; RV64V-NEXT:    add s1, sp, s1
+; RV64V-NEXT:    addi s1, s1, 96
+; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, s3
+; RV64V-NEXT:    csrr s1, vlenb
+; RV64V-NEXT:    mv s2, s1
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    add s2, s2, s1
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    add s2, s2, s1
+; RV64V-NEXT:    slli s1, s1, 2
+; RV64V-NEXT:    add s1, s1, s2
+; RV64V-NEXT:    add s1, sp, s1
+; RV64V-NEXT:    addi s1, s1, 96
+; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, s4
+; RV64V-NEXT:    csrr s1, vlenb
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    mv s2, s1
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    add s2, s2, s1
+; RV64V-NEXT:    slli s1, s1, 2
+; RV64V-NEXT:    add s1, s1, s2
+; RV64V-NEXT:    add s1, sp, s1
+; RV64V-NEXT:    addi s1, s1, 96
+; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, s5
+; RV64V-NEXT:    csrr s1, vlenb
+; RV64V-NEXT:    mv s2, s1
+; RV64V-NEXT:    slli s1, s1, 2
+; RV64V-NEXT:    add s2, s2, s1
+; RV64V-NEXT:    slli s1, s1, 2
+; RV64V-NEXT:    add s1, s1, s2
+; RV64V-NEXT:    add s1, sp, s1
+; RV64V-NEXT:    addi s1, s1, 96
+; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, s6
+; RV64V-NEXT:    csrr s1, vlenb
+; RV64V-NEXT:    slli s1, s1, 2
+; RV64V-NEXT:    mv s2, s1
+; RV64V-NEXT:    slli s1, s1, 2
+; RV64V-NEXT:    add s1, s1, s2
+; RV64V-NEXT:    add s1, sp, s1
+; RV64V-NEXT:    addi s1, s1, 96
+; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, s7
+; RV64V-NEXT:    csrr s1, vlenb
+; RV64V-NEXT:    mv s2, s1
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    add s2, s2, s1
+; RV64V-NEXT:    slli s1, s1, 3
+; RV64V-NEXT:    add s1, s1, s2
+; RV64V-NEXT:    add s1, sp, s1
+; RV64V-NEXT:    addi s1, s1, 96
+; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, s8
+; RV64V-NEXT:    csrr s1, vlenb
+; RV64V-NEXT:    slli s1, s1, 1
+; RV64V-NEXT:    mv s2, s1
+; RV64V-NEXT:    slli s1, s1, 3
+; RV64V-NEXT:    add s1, s1, s2
+; RV64V-NEXT:    add s1, sp, s1
+; RV64V-NEXT:    addi s1, s1, 96
+; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, s10
+; RV64V-NEXT:    csrr s1, vlenb
+; RV64V-NEXT:    slli s2, s1, 4
+; RV64V-NEXT:    add s1, s2, s1
+; RV64V-NEXT:    add s1, sp, s1
+; RV64V-NEXT:    addi s1, s1, 96
+; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, s11
+; RV64V-NEXT:    csrr s1, vlenb
+; RV64V-NEXT:    slli s1, s1, 4
+; RV64V-NEXT:    add s1, sp, s1
+; RV64V-NEXT:    addi s1, s1, 96
+; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, ra
+; RV64V-NEXT:    csrr s1, vlenb
+; RV64V-NEXT:    slli s2, s1, 4
+; RV64V-NEXT:    sub s1, s2, s1
+; RV64V-NEXT:    add s1, sp, s1
+; RV64V-NEXT:    addi s1, s1, 96
+; RV64V-NEXT:    vs1r.v v9, (s1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, t2
+; RV64V-NEXT:    csrr t2, vlenb
+; RV64V-NEXT:    slli t2, t2, 1
+; RV64V-NEXT:    mv s1, t2
+; RV64V-NEXT:    slli t2, t2, 1
+; RV64V-NEXT:    add s1, s1, t2
+; RV64V-NEXT:    slli t2, t2, 1
+; RV64V-NEXT:    add t2, t2, s1
+; RV64V-NEXT:    add t2, sp, t2
+; RV64V-NEXT:    addi t2, t2, 96
+; RV64V-NEXT:    vs1r.v v9, (t2) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, t1
+; RV64V-NEXT:    csrr t1, vlenb
+; RV64V-NEXT:    mv t2, t1
+; RV64V-NEXT:    slli t1, t1, 2
+; RV64V-NEXT:    add t2, t2, t1
+; RV64V-NEXT:    slli t1, t1, 1
+; RV64V-NEXT:    add t1, t1, t2
+; RV64V-NEXT:    add t1, sp, t1
+; RV64V-NEXT:    addi t1, t1, 96
+; RV64V-NEXT:    vs1r.v v9, (t1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, a7
+; RV64V-NEXT:    csrr a7, vlenb
+; RV64V-NEXT:    slli a7, a7, 2
+; RV64V-NEXT:    mv t1, a7
+; RV64V-NEXT:    slli a7, a7, 1
+; RV64V-NEXT:    add a7, a7, t1
+; RV64V-NEXT:    add a7, sp, a7
+; RV64V-NEXT:    addi a7, a7, 96
+; RV64V-NEXT:    vs1r.v v9, (a7) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, s9
+; RV64V-NEXT:    csrr a7, vlenb
+; RV64V-NEXT:    mv t1, a7
+; RV64V-NEXT:    slli a7, a7, 1
+; RV64V-NEXT:    add t1, t1, a7
+; RV64V-NEXT:    slli a7, a7, 2
+; RV64V-NEXT:    add a7, a7, t1
+; RV64V-NEXT:    add a7, sp, a7
+; RV64V-NEXT:    addi a7, a7, 96
+; RV64V-NEXT:    vs1r.v v9, (a7) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, a6
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 96
+; RV64V-NEXT:    vs1r.v v9, (a6) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, a5
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a6, a5, 3
+; RV64V-NEXT:    add a5, a6, a5
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 96
+; RV64V-NEXT:    vs1r.v v9, (a5) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, a4
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 96
+; RV64V-NEXT:    vs1r.v v9, (a4) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, a2
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a4, a2, 3
+; RV64V-NEXT:    sub a2, a4, a2
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vs1r.v v9, (a2) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v9, v11, a1
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    mv a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a2
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 96
+; RV64V-NEXT:    vs1r.v v9, (a1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vi v9, v11, 2
+; RV64V-NEXT:    vand.vi v10, v11, 1
+; RV64V-NEXT:    vand.vi v12, v11, 4
+; RV64V-NEXT:    vand.vi v13, v11, 8
+; RV64V-NEXT:    vand.vx v5, v11, a3
+; RV64V-NEXT:    vand.vx v4, v11, t0
+; RV64V-NEXT:    vand.vx v2, v11, a0
+; RV64V-NEXT:    vmul.vv v3, v8, v9
+; RV64V-NEXT:    vmul.vv v9, v8, v10
+; RV64V-NEXT:    vmul.vv v10, v8, v12
+; RV64V-NEXT:    vmul.vv v11, v8, v13
+; RV64V-NEXT:    vmul.vv v12, v8, v14
+; RV64V-NEXT:    vmul.vv v13, v8, v15
+; RV64V-NEXT:    vmul.vv v14, v8, v17
+; RV64V-NEXT:    vmul.vv v15, v8, v18
+; RV64V-NEXT:    vmul.vv v16, v8, v19
+; RV64V-NEXT:    vmul.vv v17, v8, v20
+; RV64V-NEXT:    vmul.vv v18, v8, v21
+; RV64V-NEXT:    vmul.vv v19, v8, v22
+; RV64V-NEXT:    vmul.vv v20, v8, v23
+; RV64V-NEXT:    vmul.vv v21, v8, v24
+; RV64V-NEXT:    vmul.vv v22, v8, v25
+; RV64V-NEXT:    vmul.vv v23, v8, v26
+; RV64V-NEXT:    vmul.vv v24, v8, v27
+; RV64V-NEXT:    vmul.vv v25, v8, v28
+; RV64V-NEXT:    vmul.vv v26, v8, v29
+; RV64V-NEXT:    vmul.vv v27, v8, v30
+; RV64V-NEXT:    vmul.vv v28, v8, v31
+; RV64V-NEXT:    vmul.vv v29, v8, v7
+; RV64V-NEXT:    vmul.vv v30, v8, v6
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v31, v8, v31
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v7, v8, v7
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v6, v8, v6
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 2
+; RV64V-NEXT:    add a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 1
+; RV64V-NEXT:    add a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    addi a0, sp, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 5
+; RV64V-NEXT:    add a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 5
+; RV64V-NEXT:    add a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 5
+; RV64V-NEXT:    sub a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 5
+; RV64V-NEXT:    sub a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    sub sp, sp, a0
-; RV64V-NEXT:    li a1, 56
-; RV64V-NEXT:    li a2, 40
-; RV64V-NEXT:    lui a3, 16
-; RV64V-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64V-NEXT:    vsrl.vx v24, v8, a1
-; RV64V-NEXT:    vsrl.vx v0, v8, a2
-; RV64V-NEXT:    addi a2, a3, -256
-; RV64V-NEXT:    vand.vx v0, v0, a2
-; RV64V-NEXT:    vor.vv v24, v0, v24
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 4
+; RV64V-NEXT:    add a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 4
+; RV64V-NEXT:    add a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 4
+; RV64V-NEXT:    sub a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vsrl.vi v24, v8, 24
-; RV64V-NEXT:    lui a1, 4080
-; RV64V-NEXT:    li s4, 255
-; RV64V-NEXT:    vand.vx v24, v24, a1
-; RV64V-NEXT:    slli s4, s4, 24
-; RV64V-NEXT:    vsrl.vi v0, v8, 8
-; RV64V-NEXT:    vand.vx v0, v0, s4
-; RV64V-NEXT:    vor.vv v24, v0, v24
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 3
+; RV64V-NEXT:    add a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    li a1, 40
-; RV64V-NEXT:    vsrl.vx v24, v16, a1
-; RV64V-NEXT:    sd a2, 280(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v24, v24, a2
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 3
+; RV64V-NEXT:    sub a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v8, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 96
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vmul.vv v1, v8, v5
+; RV64V-NEXT:    vmul.vv v5, v8, v4
+; RV64V-NEXT:    vmul.vv v8, v8, v2
+; RV64V-NEXT:    vxor.vv v9, v9, v3
+; RV64V-NEXT:    vxor.vv v9, v9, v10
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v12
+; RV64V-NEXT:    vxor.vv v9, v9, v13
+; RV64V-NEXT:    vxor.vv v9, v9, v14
+; RV64V-NEXT:    vxor.vv v9, v9, v15
+; RV64V-NEXT:    vxor.vv v10, v9, v16
+; RV64V-NEXT:    vxor.vv v10, v10, v17
+; RV64V-NEXT:    vxor.vv v10, v10, v18
+; RV64V-NEXT:    vxor.vv v10, v10, v19
+; RV64V-NEXT:    vxor.vv v10, v10, v20
+; RV64V-NEXT:    vxor.vv v10, v10, v21
+; RV64V-NEXT:    vxor.vv v10, v10, v22
+; RV64V-NEXT:    vxor.vv v10, v10, v23
+; RV64V-NEXT:    vxor.vv v10, v10, v24
+; RV64V-NEXT:    vxor.vv v10, v10, v25
+; RV64V-NEXT:    vxor.vv v10, v10, v26
+; RV64V-NEXT:    vxor.vv v10, v10, v27
+; RV64V-NEXT:    vxor.vv v10, v10, v28
+; RV64V-NEXT:    vxor.vv v10, v10, v29
+; RV64V-NEXT:    vxor.vv v10, v10, v30
+; RV64V-NEXT:    vxor.vv v10, v10, v31
+; RV64V-NEXT:    vxor.vv v11, v10, v7
+; RV64V-NEXT:    vxor.vv v11, v11, v6
 ; RV64V-NEXT:    li a0, 56
-; RV64V-NEXT:    vsrl.vx v0, v16, a0
-; RV64V-NEXT:    vor.vv v24, v24, v0
+; RV64V-NEXT:    vsll.vx v9, v9, a0
+; RV64V-NEXT:    vand.vx v10, v10, t4
+; RV64V-NEXT:    li a1, 40
+; RV64V-NEXT:    vsll.vx v10, v10, a1
+; RV64V-NEXT:    vor.vv v9, v9, v10
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a3, a2, 2
+; RV64V-NEXT:    add a2, a3, a2
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v0
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a3, a2, 1
+; RV64V-NEXT:    add a2, a3, a2
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    addi a2, sp, 96
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a3, a2, 5
+; RV64V-NEXT:    add a2, a3, a2
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 5
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a3, a2, 5
+; RV64V-NEXT:    sub a2, a3, a2
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v11, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v11, v12
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a3, a2, 4
+; RV64V-NEXT:    add a2, a3, a2
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v13
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 4
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v13
+; RV64V-NEXT:    vsrl.vi v13, v10, 8
+; RV64V-NEXT:    vand.vx v13, v13, t3
+; RV64V-NEXT:    vsrl.vi v11, v11, 24
+; RV64V-NEXT:    lui a2, 4080
+; RV64V-NEXT:    vand.vx v11, v11, a2
+; RV64V-NEXT:    vor.vv v11, v13, v11
 ; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
 ; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 96
+; RV64V-NEXT:    vl1r.v v13, (a3) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v13
+; RV64V-NEXT:    csrr a3, vlenb
 ; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    slli a3, a3, 2
 ; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 96
+; RV64V-NEXT:    vl1r.v v13, (a3) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v13
+; RV64V-NEXT:    csrr a3, vlenb
 ; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
 ; RV64V-NEXT:    add a3, a3, a4
 ; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vsrl.vi v24, v16, 24
-; RV64V-NEXT:    lui a3, 4080
-; RV64V-NEXT:    vand.vx v24, v24, a3
-; RV64V-NEXT:    vsrl.vi v0, v16, 8
-; RV64V-NEXT:    vand.vx v0, v0, s4
-; RV64V-NEXT:    vor.vv v24, v0, v24
-; RV64V-NEXT:    csrr a4, vlenb
-; RV64V-NEXT:    slli a4, a4, 3
-; RV64V-NEXT:    mv a5, a4
-; RV64V-NEXT:    slli a4, a4, 1
-; RV64V-NEXT:    add a5, a5, a4
-; RV64V-NEXT:    slli a4, a4, 1
-; RV64V-NEXT:    add a5, a5, a4
-; RV64V-NEXT:    slli a4, a4, 1
-; RV64V-NEXT:    add a5, a5, a4
-; RV64V-NEXT:    slli a4, a4, 3
-; RV64V-NEXT:    add a4, a4, a5
-; RV64V-NEXT:    add a4, sp, a4
-; RV64V-NEXT:    addi a4, a4, 304
-; RV64V-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    csrr a4, vlenb
-; RV64V-NEXT:    slli a4, a4, 3
-; RV64V-NEXT:    mv a5, a4
-; RV64V-NEXT:    slli a4, a4, 1
-; RV64V-NEXT:    add a5, a5, a4
-; RV64V-NEXT:    slli a4, a4, 3
-; RV64V-NEXT:    add a5, a5, a4
-; RV64V-NEXT:    slli a4, a4, 2
-; RV64V-NEXT:    add a4, a4, a5
-; RV64V-NEXT:    add a4, sp, a4
-; RV64V-NEXT:    addi a4, a4, 304
-; RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    csrr a4, vlenb
-; RV64V-NEXT:    slli a4, a4, 4
-; RV64V-NEXT:    mv a5, a4
-; RV64V-NEXT:    slli a4, a4, 3
-; RV64V-NEXT:    add a5, a5, a4
-; RV64V-NEXT:    slli a4, a4, 2
-; RV64V-NEXT:    add a4, a4, a5
-; RV64V-NEXT:    add a4, sp, a4
-; RV64V-NEXT:    addi a4, a4, 304
-; RV64V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vor.vv v24, v0, v24
-; RV64V-NEXT:    csrr a4, vlenb
-; RV64V-NEXT:    slli a4, a4, 3
-; RV64V-NEXT:    mv a5, a4
-; RV64V-NEXT:    slli a4, a4, 1
-; RV64V-NEXT:    add a5, a5, a4
-; RV64V-NEXT:    slli a4, a4, 3
-; RV64V-NEXT:    add a5, a5, a4
-; RV64V-NEXT:    slli a4, a4, 2
-; RV64V-NEXT:    add a4, a4, a5
-; RV64V-NEXT:    add a4, sp, a4
-; RV64V-NEXT:    addi a4, a4, 304
-; RV64V-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v0, v8, a3
-; RV64V-NEXT:    vsll.vi v0, v0, 24
-; RV64V-NEXT:    vand.vx v24, v8, s4
-; RV64V-NEXT:    sd s4, 288(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vsll.vi v24, v24, 8
-; RV64V-NEXT:    vor.vv v24, v0, v24
-; RV64V-NEXT:    csrr a4, vlenb
-; RV64V-NEXT:    slli a4, a4, 7
-; RV64V-NEXT:    mv a5, a4
-; RV64V-NEXT:    slli a4, a4, 2
-; RV64V-NEXT:    add a4, a4, a5
-; RV64V-NEXT:    add a4, sp, a4
-; RV64V-NEXT:    addi a4, a4, 304
-; RV64V-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vsll.vx v24, v8, a0
-; RV64V-NEXT:    li a0, 56
-; RV64V-NEXT:    vand.vx v8, v8, a2
-; RV64V-NEXT:    vsll.vx v8, v8, a1
-; RV64V-NEXT:    vor.vv v8, v24, v8
-; RV64V-NEXT:    csrr a4, vlenb
-; RV64V-NEXT:    slli a4, a4, 3
-; RV64V-NEXT:    mv a5, a4
-; RV64V-NEXT:    slli a4, a4, 4
-; RV64V-NEXT:    add a5, a5, a4
-; RV64V-NEXT:    slli a4, a4, 2
-; RV64V-NEXT:    add a4, a4, a5
-; RV64V-NEXT:    add a4, sp, a4
-; RV64V-NEXT:    addi a4, a4, 304
-; RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    csrr a4, vlenb
-; RV64V-NEXT:    slli a4, a4, 3
-; RV64V-NEXT:    mv a5, a4
-; RV64V-NEXT:    slli a4, a4, 1
-; RV64V-NEXT:    add a5, a5, a4
-; RV64V-NEXT:    slli a4, a4, 1
-; RV64V-NEXT:    add a5, a5, a4
-; RV64V-NEXT:    slli a4, a4, 1
-; RV64V-NEXT:    add a5, a5, a4
-; RV64V-NEXT:    slli a4, a4, 3
-; RV64V-NEXT:    add a4, a4, a5
-; RV64V-NEXT:    add a4, sp, a4
-; RV64V-NEXT:    addi a4, a4, 304
-; RV64V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vor.vv v24, v0, v24
-; RV64V-NEXT:    csrr a4, vlenb
-; RV64V-NEXT:    slli a4, a4, 4
-; RV64V-NEXT:    mv a5, a4
-; RV64V-NEXT:    slli a4, a4, 3
-; RV64V-NEXT:    add a5, a5, a4
-; RV64V-NEXT:    slli a4, a4, 2
-; RV64V-NEXT:    add a4, a4, a5
-; RV64V-NEXT:    add a4, sp, a4
-; RV64V-NEXT:    addi a4, a4, 304
-; RV64V-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v24, v16, a3
-; RV64V-NEXT:    vsll.vi v24, v24, 24
-; RV64V-NEXT:    vand.vx v0, v16, s4
-; RV64V-NEXT:    vsll.vi v0, v0, 8
-; RV64V-NEXT:    vor.vv v24, v24, v0
-; RV64V-NEXT:    vsll.vx v0, v16, a0
-; RV64V-NEXT:    vand.vx v16, v16, a2
-; RV64V-NEXT:    vsll.vx v16, v16, a1
-; RV64V-NEXT:    vor.vv v16, v0, v16
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 7
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vor.vv v8, v8, v0
-; RV64V-NEXT:    vor.vv v16, v16, v24
-; RV64V-NEXT:    lui a0, 61681
-; RV64V-NEXT:    lui a1, 209715
-; RV64V-NEXT:    lui a2, 349525
-; RV64V-NEXT:    li a4, 16
-; RV64V-NEXT:    li a3, 32
-; RV64V-NEXT:    li t2, 1
-; RV64V-NEXT:    addi a7, a0, -241
-; RV64V-NEXT:    addi t0, a1, 819
-; RV64V-NEXT:    addi t1, a2, 1365
-; RV64V-NEXT:    slli a0, t2, 11
-; RV64V-NEXT:    sd a0, 248(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 31
-; RV64V-NEXT:    sd a0, 240(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 32
-; RV64V-NEXT:    sd a0, 232(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 33
-; RV64V-NEXT:    sd a0, 224(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 34
-; RV64V-NEXT:    sd a0, 216(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 35
-; RV64V-NEXT:    sd a0, 208(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 36
-; RV64V-NEXT:    sd a0, 200(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 37
-; RV64V-NEXT:    sd a0, 192(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 38
-; RV64V-NEXT:    sd a0, 184(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 39
-; RV64V-NEXT:    sd a0, 176(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 40
-; RV64V-NEXT:    sd a0, 168(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 41
-; RV64V-NEXT:    sd a0, 160(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 42
-; RV64V-NEXT:    sd a0, 152(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 43
-; RV64V-NEXT:    sd a0, 144(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 44
-; RV64V-NEXT:    sd a0, 136(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, a7, 32
-; RV64V-NEXT:    add a7, a7, a0
-; RV64V-NEXT:    slli a0, t0, 32
-; RV64V-NEXT:    add t0, t0, a0
-; RV64V-NEXT:    slli a0, t1, 32
-; RV64V-NEXT:    add a0, t1, a0
-; RV64V-NEXT:    sd a0, 272(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a1, t2, 45
-; RV64V-NEXT:    sd a1, 128(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 4
-; RV64V-NEXT:    mv a2, a1
-; RV64V-NEXT:    slli a1, a1, 3
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a1, a1, a2
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 304
-; RV64V-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vor.vv v16, v16, v24
-; RV64V-NEXT:    vsrl.vi v24, v16, 4
-; RV64V-NEXT:    vand.vx v16, v16, a7
-; RV64V-NEXT:    vand.vx v24, v24, a7
-; RV64V-NEXT:    sd a7, 256(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vsll.vi v16, v16, 4
-; RV64V-NEXT:    vor.vv v16, v24, v16
-; RV64V-NEXT:    vsrl.vi v24, v16, 2
-; RV64V-NEXT:    vand.vx v16, v16, t0
-; RV64V-NEXT:    vand.vx v24, v24, t0
-; RV64V-NEXT:    sd t0, 264(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vsll.vi v16, v16, 2
-; RV64V-NEXT:    vor.vv v16, v24, v16
-; RV64V-NEXT:    vsrl.vi v24, v16, 1
-; RV64V-NEXT:    vand.vx v16, v16, a0
-; RV64V-NEXT:    vand.vx v24, v24, a0
-; RV64V-NEXT:    vadd.vv v16, v16, v16
-; RV64V-NEXT:    vor.vv v0, v24, v16
-; RV64V-NEXT:    vand.vx v16, v0, a4
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 4
-; RV64V-NEXT:    mv a2, a1
-; RV64V-NEXT:    slli a1, a1, 3
-; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 2
-; RV64V-NEXT:    add a1, a1, a2
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 304
-; RV64V-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    slli a1, t2, 46
-; RV64V-NEXT:    sd a1, 120(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    addi a3, a3, 96
+; RV64V-NEXT:    vl1r.v v13, (a3) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v13
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 96
+; RV64V-NEXT:    vl1r.v v13, (a3) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v13
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 96
+; RV64V-NEXT:    vl1r.v v13, (a3) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v13
+; RV64V-NEXT:    vand.vx v10, v10, a2
+; RV64V-NEXT:    vsll.vi v10, v10, 24
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v13, v12, v13
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v14, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v13, v13, v14
+; RV64V-NEXT:    vand.vx v14, v12, t3
+; RV64V-NEXT:    vsll.vi v14, v14, 8
+; RV64V-NEXT:    vor.vv v10, v10, v14
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v14, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v13, v13, v14
+; RV64V-NEXT:    vor.vv v9, v9, v10
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 96
+; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v13, v10
+; RV64V-NEXT:    vsrl.vx v12, v12, a1
+; RV64V-NEXT:    vand.vx v12, v12, t4
 ; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 3
 ; RV64V-NEXT:    mv a2, a1
 ; RV64V-NEXT:    slli a1, a1, 1
 ; RV64V-NEXT:    add a2, a2, a1
-; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    slli a1, a1, 2
 ; RV64V-NEXT:    add a2, a2, a1
 ; RV64V-NEXT:    slli a1, a1, 2
 ; RV64V-NEXT:    add a1, a1, a2
 ; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 304
-; RV64V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vor.vv v8, v8, v16
-; RV64V-NEXT:    vsrl.vi v16, v8, 4
-; RV64V-NEXT:    vand.vx v8, v8, a7
-; RV64V-NEXT:    vand.vx v16, v16, a7
+; RV64V-NEXT:    addi a1, a1, 96
+; RV64V-NEXT:    vl1r.v v13, (a1) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v13
+; RV64V-NEXT:    vxor.vv v10, v10, v1
+; RV64V-NEXT:    vxor.vv v10, v10, v5
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    vsrl.vx v8, v8, a0
+; RV64V-NEXT:    vor.vv v8, v12, v8
+; RV64V-NEXT:    vor.vv v8, v11, v8
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    vsrl.vi v9, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, t5
+; RV64V-NEXT:    vand.vx v9, v9, t5
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    vsrl.vi v9, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, t6
+; RV64V-NEXT:    vand.vx v9, v9, t6
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    vsrl.vi v9, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, s0
+; RV64V-NEXT:    vand.vx v9, v9, s0
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    vsrl.vi v8, v8, 1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add sp, sp, a0
+; RV64V-NEXT:    ld ra, 200(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s0, 192(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s1, 184(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s2, 176(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s3, 168(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s4, 160(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s5, 152(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s6, 144(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s7, 136(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s8, 128(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s9, 120(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s10, 112(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s11, 104(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    addi sp, sp, 208
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv1i64_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v9
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv1i64_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC-NEXT:    vclmulh.vv v8, v8, v9
+; RV64ZVBC-NEXT:    ret
+  %va.ext = zext <vscale x 1 x i64> %va to <vscale x 1 x i128>
+  %vb.ext = zext <vscale x 1 x i64> %vb to <vscale x 1 x i128>
+  %clmul = call <vscale x 1 x i128> @llvm.clmul.nxv1i128(<vscale x 1 x i128> %va.ext, <vscale x 1 x i128> %vb.ext)
+  %res.ext = lshr <vscale x 1 x i128> %clmul, splat(i128 64)
+  %res = trunc <vscale x 1 x i128> %res.ext to <vscale x 1 x i64>
+  ret <vscale x 1 x i64> %res
+}
+
+define <vscale x 1 x i64> @clmulh_nxv1i64_vx(<vscale x 1 x i64> %va, i64 %b) nounwind {
+; RV32V-LABEL: clmulh_nxv1i64_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -368
+; RV32V-NEXT:    sw ra, 364(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s0, 360(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s1, 356(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s2, 352(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s3, 348(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s4, 344(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s5, 340(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s6, 336(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s7, 332(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s8, 328(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s9, 324(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s10, 320(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s11, 316(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    sub sp, sp, a2
+; RV32V-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV32V-NEXT:    vmv1r.v v9, v8
+; RV32V-NEXT:    sw a0, 16(sp)
+; RV32V-NEXT:    sw a1, 20(sp)
+; RV32V-NEXT:    addi t6, sp, 16
+; RV32V-NEXT:    lui s0, 1044480
+; RV32V-NEXT:    lui a4, 524288
+; RV32V-NEXT:    li s11, 1
+; RV32V-NEXT:    li s6, 2
+; RV32V-NEXT:    li s7, 4
+; RV32V-NEXT:    li s8, 8
+; RV32V-NEXT:    li s10, 16
+; RV32V-NEXT:    li s9, 32
+; RV32V-NEXT:    li s1, 64
+; RV32V-NEXT:    li s2, 128
+; RV32V-NEXT:    li s3, 256
+; RV32V-NEXT:    li s4, 512
+; RV32V-NEXT:    li s5, 1024
+; RV32V-NEXT:    lui a3, 1
+; RV32V-NEXT:    lui ra, 2
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    lui a1, 8
+; RV32V-NEXT:    lui a5, 16
+; RV32V-NEXT:    lui a2, 32
+; RV32V-NEXT:    lui a6, 64
+; RV32V-NEXT:    lui a7, 128
+; RV32V-NEXT:    lui t0, 256
+; RV32V-NEXT:    lui t1, 512
+; RV32V-NEXT:    lui t2, 1024
+; RV32V-NEXT:    lui t3, 2048
+; RV32V-NEXT:    lui t4, 4096
+; RV32V-NEXT:    lui t5, 8192
+; RV32V-NEXT:    vlse64.v v13, (t6), zero
+; RV32V-NEXT:    lui t6, 16384
+; RV32V-NEXT:    sw s0, 264(sp)
+; RV32V-NEXT:    lui s0, 32768
+; RV32V-NEXT:    sw zero, 268(sp)
+; RV32V-NEXT:    sw a4, 24(sp)
+; RV32V-NEXT:    sw zero, 28(sp)
+; RV32V-NEXT:    sw zero, 288(sp)
+; RV32V-NEXT:    sw s11, 292(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw s6, 276(sp)
+; RV32V-NEXT:    lui s6, 65536
+; RV32V-NEXT:    sw zero, 280(sp)
+; RV32V-NEXT:    sw s7, 284(sp)
+; RV32V-NEXT:    lui s7, 131072
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw s8, 260(sp)
+; RV32V-NEXT:    lui s8, 262144
+; RV32V-NEXT:    sw zero, 248(sp)
+; RV32V-NEXT:    sw s10, 252(sp)
+; RV32V-NEXT:    sw zero, 240(sp)
+; RV32V-NEXT:    sw s9, 244(sp)
+; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    sw s1, 236(sp)
+; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw s2, 228(sp)
+; RV32V-NEXT:    sw zero, 216(sp)
+; RV32V-NEXT:    sw s3, 220(sp)
+; RV32V-NEXT:    sw zero, 208(sp)
+; RV32V-NEXT:    sw s4, 212(sp)
+; RV32V-NEXT:    sw zero, 200(sp)
+; RV32V-NEXT:    sw s5, 204(sp)
+; RV32V-NEXT:    slli s11, s11, 11
+; RV32V-NEXT:    sw zero, 192(sp)
+; RV32V-NEXT:    sw s11, 196(sp)
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw a3, 188(sp)
+; RV32V-NEXT:    sw zero, 176(sp)
+; RV32V-NEXT:    sw ra, 180(sp)
+; RV32V-NEXT:    sw zero, 168(sp)
+; RV32V-NEXT:    sw a0, 172(sp)
+; RV32V-NEXT:    sw zero, 160(sp)
+; RV32V-NEXT:    sw a1, 164(sp)
+; RV32V-NEXT:    sw zero, 152(sp)
+; RV32V-NEXT:    sw a5, 156(sp)
+; RV32V-NEXT:    sw zero, 144(sp)
+; RV32V-NEXT:    sw a2, 148(sp)
+; RV32V-NEXT:    sw zero, 136(sp)
+; RV32V-NEXT:    sw a6, 140(sp)
+; RV32V-NEXT:    sw zero, 128(sp)
+; RV32V-NEXT:    sw a7, 132(sp)
+; RV32V-NEXT:    sw zero, 120(sp)
+; RV32V-NEXT:    sw t0, 124(sp)
+; RV32V-NEXT:    sw zero, 112(sp)
+; RV32V-NEXT:    sw t1, 116(sp)
+; RV32V-NEXT:    sw zero, 104(sp)
+; RV32V-NEXT:    sw t2, 108(sp)
+; RV32V-NEXT:    sw zero, 96(sp)
+; RV32V-NEXT:    sw t3, 100(sp)
+; RV32V-NEXT:    sw zero, 88(sp)
+; RV32V-NEXT:    sw t4, 92(sp)
+; RV32V-NEXT:    sw zero, 80(sp)
+; RV32V-NEXT:    sw t5, 84(sp)
+; RV32V-NEXT:    sw zero, 72(sp)
+; RV32V-NEXT:    sw t6, 76(sp)
+; RV32V-NEXT:    sw zero, 64(sp)
+; RV32V-NEXT:    sw s0, 68(sp)
+; RV32V-NEXT:    sw zero, 56(sp)
+; RV32V-NEXT:    sw s6, 60(sp)
+; RV32V-NEXT:    sw zero, 48(sp)
+; RV32V-NEXT:    sw s7, 52(sp)
+; RV32V-NEXT:    sw zero, 40(sp)
+; RV32V-NEXT:    sw s8, 44(sp)
+; RV32V-NEXT:    sw zero, 32(sp)
+; RV32V-NEXT:    sw a4, 36(sp)
+; RV32V-NEXT:    lui a0, 61681
+; RV32V-NEXT:    addi a0, a0, -241
+; RV32V-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32V-NEXT:    vmv.v.x v14, a0
+; RV32V-NEXT:    lui a0, 209715
+; RV32V-NEXT:    addi a0, a0, 819
+; RV32V-NEXT:    vmv.v.x v10, a0
+; RV32V-NEXT:    lui a0, 349525
+; RV32V-NEXT:    addi a0, a0, 1365
+; RV32V-NEXT:    vmv.v.x v11, a0
+; RV32V-NEXT:    addi a0, sp, 264
+; RV32V-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 24
+; RV32V-NEXT:    vlse64.v v7, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 288
+; RV32V-NEXT:    vlse64.v v31, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 272
+; RV32V-NEXT:    vlse64.v v30, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 280
+; RV32V-NEXT:    vlse64.v v29, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 256
+; RV32V-NEXT:    vlse64.v v26, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 248
+; RV32V-NEXT:    vlse64.v v23, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 240
+; RV32V-NEXT:    vlse64.v v20, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 232
+; RV32V-NEXT:    vlse64.v v17, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 224
+; RV32V-NEXT:    vlse64.v v15, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 216
+; RV32V-NEXT:    vlse64.v v8, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 304
+; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 208
+; RV32V-NEXT:    vlse64.v v18, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 200
+; RV32V-NEXT:    vlse64.v v16, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 192
+; RV32V-NEXT:    vlse64.v v19, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 184
+; RV32V-NEXT:    vlse64.v v21, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 176
+; RV32V-NEXT:    vlse64.v v22, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 168
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 160
+; RV32V-NEXT:    vlse64.v v28, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 152
+; RV32V-NEXT:    vlse64.v v25, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 144
+; RV32V-NEXT:    vlse64.v v27, (a0), zero
+; RV32V-NEXT:    li ra, 56
+; RV32V-NEXT:    vsrl.vi v6, v9, 24
+; RV32V-NEXT:    vsrl.vi v5, v9, 8
+; RV32V-NEXT:    vsrl.vx v4, v9, ra
+; RV32V-NEXT:    li a2, 40
+; RV32V-NEXT:    vsrl.vx v3, v9, a2
+; RV32V-NEXT:    lui a1, 4080
+; RV32V-NEXT:    vand.vx v6, v6, a1
+; RV32V-NEXT:    vsll.vx v2, v9, ra
+; RV32V-NEXT:    addi a0, a5, -256
+; RV32V-NEXT:    vand.vx v3, v3, a0
+; RV32V-NEXT:    vand.vx v1, v9, a0
+; RV32V-NEXT:    vor.vv v3, v3, v4
+; RV32V-NEXT:    vsll.vx v4, v1, a2
+; RV32V-NEXT:    vor.vv v2, v2, v4
+; RV32V-NEXT:    vsrl.vx v4, v13, ra
+; RV32V-NEXT:    vsrl.vx v1, v13, a2
+; RV32V-NEXT:    vsll.vx v0, v13, ra
+; RV32V-NEXT:    vand.vx v1, v1, a0
+; RV32V-NEXT:    vor.vv v4, v1, v4
+; RV32V-NEXT:    vand.vx v1, v13, a0
+; RV32V-NEXT:    vsll.vx v1, v1, a2
+; RV32V-NEXT:    vor.vv v1, v0, v1
+; RV32V-NEXT:    vsrl.vi v0, v13, 24
+; RV32V-NEXT:    vand.vv v5, v5, v12
+; RV32V-NEXT:    vor.vv v8, v5, v6
+; RV32V-NEXT:    vsrl.vi v6, v13, 8
+; RV32V-NEXT:    vand.vx v5, v0, a1
+; RV32V-NEXT:    vand.vv v6, v6, v12
+; RV32V-NEXT:    vor.vv v5, v6, v5
+; RV32V-NEXT:    addi a3, sp, 136
+; RV32V-NEXT:    vlse64.v v6, (a3), zero
+; RV32V-NEXT:    vor.vv v0, v5, v4
+; RV32V-NEXT:    vand.vx v5, v13, a1
+; RV32V-NEXT:    vsll.vi v5, v5, 24
+; RV32V-NEXT:    vand.vv v13, v13, v12
+; RV32V-NEXT:    vsll.vi v13, v13, 8
+; RV32V-NEXT:    vor.vv v13, v5, v13
+; RV32V-NEXT:    addi a3, sp, 128
+; RV32V-NEXT:    vlse64.v v5, (a3), zero
+; RV32V-NEXT:    vor.vv v13, v1, v13
+; RV32V-NEXT:    addi a3, sp, 120
+; RV32V-NEXT:    vlse64.v v4, (a3), zero
+; RV32V-NEXT:    vor.vv v13, v13, v0
+; RV32V-NEXT:    vsrl.vi v1, v13, 4
+; RV32V-NEXT:    vand.vv v13, v13, v14
+; RV32V-NEXT:    vand.vv v1, v1, v14
+; RV32V-NEXT:    vsll.vi v13, v13, 4
+; RV32V-NEXT:    vor.vv v13, v1, v13
+; RV32V-NEXT:    vsrl.vi v1, v13, 2
+; RV32V-NEXT:    vand.vv v13, v13, v10
+; RV32V-NEXT:    vand.vv v1, v1, v10
+; RV32V-NEXT:    vsll.vi v13, v13, 2
+; RV32V-NEXT:    vor.vv v13, v1, v13
+; RV32V-NEXT:    vsrl.vi v1, v13, 1
+; RV32V-NEXT:    vand.vv v13, v13, v11
+; RV32V-NEXT:    vand.vv v1, v1, v11
+; RV32V-NEXT:    vadd.vv v13, v13, v13
+; RV32V-NEXT:    vor.vv v13, v1, v13
+; RV32V-NEXT:    vand.vx v1, v13, s10
+; RV32V-NEXT:    vor.vv v8, v8, v3
+; RV32V-NEXT:    vand.vx v3, v9, a1
+; RV32V-NEXT:    vsll.vi v3, v3, 24
+; RV32V-NEXT:    vand.vv v9, v9, v12
+; RV32V-NEXT:    vsll.vi v9, v9, 8
+; RV32V-NEXT:    vor.vv v9, v3, v9
+; RV32V-NEXT:    vand.vx v3, v13, s9
+; RV32V-NEXT:    vor.vv v9, v2, v9
+; RV32V-NEXT:    vor.vv v8, v9, v8
+; RV32V-NEXT:    vsrl.vi v9, v8, 4
+; RV32V-NEXT:    vand.vv v8, v8, v14
+; RV32V-NEXT:    vand.vv v9, v9, v14
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v9, v8
+; RV32V-NEXT:    vsrl.vi v9, v8, 2
+; RV32V-NEXT:    vand.vv v8, v8, v10
+; RV32V-NEXT:    vand.vv v9, v9, v10
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v9, v8
+; RV32V-NEXT:    vsrl.vi v9, v8, 1
+; RV32V-NEXT:    vand.vv v8, v8, v11
+; RV32V-NEXT:    vand.vv v9, v9, v11
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v9, v8
+; RV32V-NEXT:    vand.vi v9, v13, 2
+; RV32V-NEXT:    vand.vi v2, v13, 1
+; RV32V-NEXT:    vmul.vv v9, v8, v9
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v9, v2, v9
+; RV32V-NEXT:    vand.vi v2, v13, 4
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v9, v9, v2
+; RV32V-NEXT:    vand.vi v2, v13, 8
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v9, v9, v2
+; RV32V-NEXT:    vand.vx v2, v13, s1
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v9, v9, v1
+; RV32V-NEXT:    vand.vx v1, v13, s2
+; RV32V-NEXT:    vmul.vv v3, v8, v3
+; RV32V-NEXT:    vxor.vv v9, v9, v3
+; RV32V-NEXT:    vand.vx v0, v13, s3
+; RV32V-NEXT:    vmul.vv v3, v8, v2
+; RV32V-NEXT:    vxor.vv v9, v9, v3
+; RV32V-NEXT:    vand.vx v2, v13, s4
+; RV32V-NEXT:    vmul.vv v3, v8, v1
+; RV32V-NEXT:    vxor.vv v3, v9, v3
+; RV32V-NEXT:    vand.vx v9, v13, s5
+; RV32V-NEXT:    vmul.vv v1, v8, v0
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v1, v3, v1
+; RV32V-NEXT:    vxor.vv v2, v1, v2
+; RV32V-NEXT:    vand.vx v1, v13, s11
+; RV32V-NEXT:    vmul.vv v9, v8, v9
+; RV32V-NEXT:    vxor.vv v9, v2, v9
+; RV32V-NEXT:    lui a3, 1
+; RV32V-NEXT:    vand.vx v2, v13, a3
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v9, v9, v1
+; RV32V-NEXT:    lui a3, 2
+; RV32V-NEXT:    vand.vx v1, v13, a3
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v9, v9, v2
+; RV32V-NEXT:    lui a3, 4
+; RV32V-NEXT:    vand.vx v2, v13, a3
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v9, v9, v1
+; RV32V-NEXT:    lui a3, 8
+; RV32V-NEXT:    vand.vx v1, v13, a3
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v9, v9, v2
+; RV32V-NEXT:    vand.vx v2, v13, a5
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v9, v9, v1
+; RV32V-NEXT:    lui a3, 32
+; RV32V-NEXT:    vand.vx v1, v13, a3
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v9, v9, v2
+; RV32V-NEXT:    vand.vx v2, v13, a6
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v9, v9, v1
+; RV32V-NEXT:    vand.vx v1, v13, a7
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v9, v9, v2
+; RV32V-NEXT:    vand.vx v2, v13, t0
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v9, v9, v1
+; RV32V-NEXT:    vand.vx v1, v13, t1
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v9, v9, v2
+; RV32V-NEXT:    vand.vx v2, v13, t2
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v9, v9, v1
+; RV32V-NEXT:    vand.vx v1, v13, t3
+; RV32V-NEXT:    vmul.vv v2, v8, v2
+; RV32V-NEXT:    vxor.vv v9, v9, v2
+; RV32V-NEXT:    vand.vx v0, v13, t4
+; RV32V-NEXT:    vmul.vv v2, v8, v1
+; RV32V-NEXT:    vxor.vv v2, v9, v2
+; RV32V-NEXT:    vand.vx v9, v13, t5
+; RV32V-NEXT:    vmul.vv v1, v8, v0
+; RV32V-NEXT:    vmul.vv v9, v8, v9
+; RV32V-NEXT:    vxor.vv v1, v2, v1
+; RV32V-NEXT:    vxor.vv v9, v1, v9
+; RV32V-NEXT:    vand.vx v1, v13, t6
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v9, v9, v1
+; RV32V-NEXT:    vand.vx v1, v13, s0
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v9, v9, v1
+; RV32V-NEXT:    vand.vx v1, v13, s6
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v9, v9, v1
+; RV32V-NEXT:    vand.vx v1, v13, s7
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v9, v9, v1
+; RV32V-NEXT:    vand.vx v1, v13, s8
+; RV32V-NEXT:    addi a3, sp, 112
+; RV32V-NEXT:    vmul.vv v1, v8, v1
+; RV32V-NEXT:    vxor.vv v9, v9, v1
+; RV32V-NEXT:    vlse64.v v1, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 104
+; RV32V-NEXT:    vand.vv v7, v13, v7
+; RV32V-NEXT:    vmul.vv v7, v8, v7
+; RV32V-NEXT:    vxor.vv v9, v9, v7
+; RV32V-NEXT:    vlse64.v v7, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 96
+; RV32V-NEXT:    vand.vv v31, v13, v31
+; RV32V-NEXT:    vmul.vv v31, v8, v31
+; RV32V-NEXT:    vxor.vv v9, v9, v31
+; RV32V-NEXT:    vlse64.v v31, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 88
+; RV32V-NEXT:    vand.vv v30, v13, v30
+; RV32V-NEXT:    vmul.vv v30, v8, v30
+; RV32V-NEXT:    vxor.vv v9, v9, v30
+; RV32V-NEXT:    vlse64.v v30, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 80
+; RV32V-NEXT:    vand.vv v29, v13, v29
+; RV32V-NEXT:    vmul.vv v29, v8, v29
+; RV32V-NEXT:    vxor.vv v9, v9, v29
+; RV32V-NEXT:    vlse64.v v29, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 72
+; RV32V-NEXT:    vand.vv v26, v13, v26
+; RV32V-NEXT:    vmul.vv v26, v8, v26
+; RV32V-NEXT:    vxor.vv v9, v9, v26
+; RV32V-NEXT:    vlse64.v v26, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 64
+; RV32V-NEXT:    vand.vv v23, v13, v23
+; RV32V-NEXT:    vmul.vv v23, v8, v23
+; RV32V-NEXT:    vxor.vv v9, v9, v23
+; RV32V-NEXT:    vlse64.v v23, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 56
+; RV32V-NEXT:    vand.vv v20, v13, v20
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v9, v9, v20
+; RV32V-NEXT:    vlse64.v v20, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 48
+; RV32V-NEXT:    vand.vv v17, v13, v17
+; RV32V-NEXT:    vmul.vv v17, v8, v17
+; RV32V-NEXT:    vxor.vv v9, v9, v17
+; RV32V-NEXT:    vlse64.v v17, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 40
+; RV32V-NEXT:    vand.vv v15, v13, v15
+; RV32V-NEXT:    vmul.vv v15, v8, v15
+; RV32V-NEXT:    vxor.vv v15, v9, v15
+; RV32V-NEXT:    vlse64.v v0, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 32
+; RV32V-NEXT:    addi a4, sp, 304
+; RV32V-NEXT:    vl1r.v v9, (a4) # vscale x 8-byte Folded Reload
+; RV32V-NEXT:    vand.vv v9, v13, v9
+; RV32V-NEXT:    vand.vv v18, v13, v18
+; RV32V-NEXT:    vmul.vv v9, v8, v9
+; RV32V-NEXT:    vmul.vv v18, v8, v18
+; RV32V-NEXT:    vxor.vv v9, v15, v9
+; RV32V-NEXT:    vxor.vv v9, v9, v18
+; RV32V-NEXT:    vlse64.v v18, (a3), zero
+; RV32V-NEXT:    vand.vv v16, v13, v16
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    vxor.vv v9, v9, v16
+; RV32V-NEXT:    vand.vv v16, v13, v19
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    vxor.vv v9, v9, v16
+; RV32V-NEXT:    vand.vv v16, v13, v21
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    vxor.vv v9, v9, v16
+; RV32V-NEXT:    vand.vv v16, v13, v22
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    vxor.vv v9, v9, v16
+; RV32V-NEXT:    vand.vv v16, v13, v24
+; RV32V-NEXT:    vand.vv v19, v13, v28
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    vmul.vv v19, v8, v19
+; RV32V-NEXT:    vxor.vv v9, v9, v16
+; RV32V-NEXT:    vand.vx v16, v15, a1
+; RV32V-NEXT:    vxor.vv v9, v9, v19
+; RV32V-NEXT:    vsrl.vi v19, v9, 24
+; RV32V-NEXT:    vand.vx v19, v19, a1
+; RV32V-NEXT:    vand.vv v21, v13, v25
+; RV32V-NEXT:    vmul.vv v21, v8, v21
+; RV32V-NEXT:    vxor.vv v9, v9, v21
+; RV32V-NEXT:    vand.vv v21, v13, v27
+; RV32V-NEXT:    vmul.vv v21, v8, v21
+; RV32V-NEXT:    vxor.vv v9, v9, v21
+; RV32V-NEXT:    vand.vv v21, v13, v6
+; RV32V-NEXT:    vmul.vv v21, v8, v21
+; RV32V-NEXT:    vxor.vv v9, v9, v21
+; RV32V-NEXT:    vand.vv v21, v13, v5
+; RV32V-NEXT:    vmul.vv v21, v8, v21
+; RV32V-NEXT:    vxor.vv v9, v9, v21
+; RV32V-NEXT:    vand.vv v21, v13, v4
+; RV32V-NEXT:    vmul.vv v21, v8, v21
+; RV32V-NEXT:    vxor.vv v9, v9, v21
+; RV32V-NEXT:    vand.vv v21, v13, v1
+; RV32V-NEXT:    vmul.vv v21, v8, v21
+; RV32V-NEXT:    vxor.vv v9, v9, v21
+; RV32V-NEXT:    vand.vv v21, v13, v7
+; RV32V-NEXT:    vmul.vv v21, v8, v21
+; RV32V-NEXT:    vxor.vv v9, v9, v21
+; RV32V-NEXT:    vand.vv v21, v13, v31
+; RV32V-NEXT:    vmul.vv v21, v8, v21
+; RV32V-NEXT:    vand.vx v22, v2, a0
+; RV32V-NEXT:    vsll.vx v22, v22, a2
+; RV32V-NEXT:    vxor.vv v9, v9, v21
+; RV32V-NEXT:    vsrl.vx v21, v9, a2
+; RV32V-NEXT:    vand.vx v21, v21, a0
+; RV32V-NEXT:    vand.vv v24, v13, v30
+; RV32V-NEXT:    vand.vv v25, v13, v29
+; RV32V-NEXT:    vand.vv v26, v13, v26
+; RV32V-NEXT:    vand.vv v23, v13, v23
+; RV32V-NEXT:    vand.vv v20, v13, v20
+; RV32V-NEXT:    vand.vv v17, v13, v17
+; RV32V-NEXT:    vand.vv v27, v13, v0
+; RV32V-NEXT:    vand.vv v13, v13, v18
+; RV32V-NEXT:    vmul.vv v18, v8, v24
+; RV32V-NEXT:    vmul.vv v24, v8, v25
+; RV32V-NEXT:    vmul.vv v25, v8, v26
+; RV32V-NEXT:    vmul.vv v23, v8, v23
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vmul.vv v17, v8, v17
+; RV32V-NEXT:    vmul.vv v26, v8, v27
+; RV32V-NEXT:    vmul.vv v8, v8, v13
+; RV32V-NEXT:    vxor.vv v13, v9, v18
+; RV32V-NEXT:    vxor.vv v13, v13, v24
+; RV32V-NEXT:    vxor.vv v13, v13, v25
+; RV32V-NEXT:    vxor.vv v13, v13, v23
+; RV32V-NEXT:    vxor.vv v13, v13, v20
+; RV32V-NEXT:    vxor.vv v13, v13, v17
+; RV32V-NEXT:    vxor.vv v13, v13, v26
+; RV32V-NEXT:    vxor.vv v8, v13, v8
+; RV32V-NEXT:    vsll.vx v13, v3, ra
+; RV32V-NEXT:    vsrl.vx v8, v8, ra
+; RV32V-NEXT:    vor.vv v13, v13, v22
+; RV32V-NEXT:    vsrl.vi v15, v15, 8
+; RV32V-NEXT:    vsll.vi v16, v16, 24
+; RV32V-NEXT:    vand.vv v15, v15, v12
+; RV32V-NEXT:    vor.vv v15, v15, v19
+; RV32V-NEXT:    vand.vv v9, v9, v12
+; RV32V-NEXT:    vsll.vi v9, v9, 8
+; RV32V-NEXT:    vor.vv v9, v16, v9
+; RV32V-NEXT:    vor.vv v9, v13, v9
+; RV32V-NEXT:    vor.vv v8, v21, v8
+; RV32V-NEXT:    vor.vv v8, v15, v8
+; RV32V-NEXT:    vor.vv v8, v9, v8
+; RV32V-NEXT:    vsrl.vi v9, v8, 4
+; RV32V-NEXT:    vand.vv v8, v8, v14
+; RV32V-NEXT:    vand.vv v9, v9, v14
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v9, v8
+; RV32V-NEXT:    vsrl.vi v9, v8, 2
+; RV32V-NEXT:    vand.vv v8, v8, v10
+; RV32V-NEXT:    vand.vv v9, v9, v10
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v9, v8
+; RV32V-NEXT:    vsrl.vi v9, v8, 1
+; RV32V-NEXT:    vand.vv v8, v8, v11
+; RV32V-NEXT:    vand.vv v9, v9, v11
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v9, v8
+; RV32V-NEXT:    vsrl.vi v8, v8, 1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    add sp, sp, a0
+; RV32V-NEXT:    lw ra, 364(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s0, 360(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s1, 356(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s2, 352(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s3, 348(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s4, 344(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s5, 340(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s6, 336(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s7, 332(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s8, 328(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s9, 324(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s10, 320(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s11, 316(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    addi sp, sp, 368
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv1i64_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    li a1, 56
+; RV64V-NEXT:    lui t2, 16
+; RV64V-NEXT:    lui a2, 4080
+; RV64V-NEXT:    li t0, 255
+; RV64V-NEXT:    lui a3, 61681
+; RV64V-NEXT:    lui a4, 209715
+; RV64V-NEXT:    lui a5, 349525
+; RV64V-NEXT:    srli a6, a0, 24
+; RV64V-NEXT:    srli a7, a0, 8
+; RV64V-NEXT:    srli t1, a0, 40
+; RV64V-NEXT:    srli t3, a0, 56
+; RV64V-NEXT:    addi a3, a3, -241
+; RV64V-NEXT:    addi a4, a4, 819
+; RV64V-NEXT:    addi t4, a5, 1365
+; RV64V-NEXT:    slli a5, a3, 32
+; RV64V-NEXT:    add a5, a3, a5
+; RV64V-NEXT:    slli a3, a4, 32
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, t4, 32
+; RV64V-NEXT:    add a3, t4, a3
+; RV64V-NEXT:    srliw t4, a0, 24
+; RV64V-NEXT:    slli t0, t0, 24
+; RV64V-NEXT:    and a6, a6, a2
+; RV64V-NEXT:    and a7, a7, t0
+; RV64V-NEXT:    or t5, a7, a6
+; RV64V-NEXT:    addi a6, t2, -256
+; RV64V-NEXT:    and a7, t1, a6
+; RV64V-NEXT:    or t1, a7, t3
+; RV64V-NEXT:    and a7, a0, a2
+; RV64V-NEXT:    slli t4, t4, 32
+; RV64V-NEXT:    slli a7, a7, 24
+; RV64V-NEXT:    or t3, a7, t4
+; RV64V-NEXT:    li a7, 40
+; RV64V-NEXT:    vsetvli t4, zero, e64, m1, ta, ma
+; RV64V-NEXT:    vsrl.vi v10, v8, 24
+; RV64V-NEXT:    vsrl.vi v9, v8, 8
+; RV64V-NEXT:    or t1, t5, t1
+; RV64V-NEXT:    slli t4, a0, 56
+; RV64V-NEXT:    and a0, a0, a6
+; RV64V-NEXT:    slli a0, a0, 40
+; RV64V-NEXT:    or t4, t4, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    or t4, t4, t3
+; RV64V-NEXT:    lui t3, 1
+; RV64V-NEXT:    vsrl.vx v11, v8, a1
+; RV64V-NEXT:    vsrl.vx v12, v8, a7
+; RV64V-NEXT:    vand.vx v10, v10, a2
+; RV64V-NEXT:    vand.vx v13, v8, a2
+; RV64V-NEXT:    vsll.vx v14, v8, a1
+; RV64V-NEXT:    vand.vx v12, v12, a6
+; RV64V-NEXT:    vand.vx v9, v9, t0
+; RV64V-NEXT:    vsll.vi v13, v13, 24
+; RV64V-NEXT:    vor.vv v11, v12, v11
+; RV64V-NEXT:    vand.vx v12, v8, t0
+; RV64V-NEXT:    vand.vx v8, v8, a6
+; RV64V-NEXT:    vor.vv v9, v9, v10
+; RV64V-NEXT:    vsll.vi v10, v12, 8
+; RV64V-NEXT:    vsll.vx v8, v8, a7
+; RV64V-NEXT:    vor.vv v9, v9, v11
+; RV64V-NEXT:    vor.vv v10, v13, v10
+; RV64V-NEXT:    vor.vv v8, v14, v8
+; RV64V-NEXT:    vor.vv v8, v8, v10
+; RV64V-NEXT:    vor.vv v8, v8, v9
+; RV64V-NEXT:    or t1, t4, t1
+; RV64V-NEXT:    vsrl.vi v9, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, a5
+; RV64V-NEXT:    srli t4, t1, 4
+; RV64V-NEXT:    and t1, t1, a5
+; RV64V-NEXT:    vand.vx v9, v9, a5
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    and t4, t4, a5
+; RV64V-NEXT:    slli t1, t1, 4
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    or t1, t4, t1
+; RV64V-NEXT:    vsrl.vi v9, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, a4
+; RV64V-NEXT:    srli t4, t1, 2
+; RV64V-NEXT:    and t1, t1, a4
+; RV64V-NEXT:    vand.vx v9, v9, a4
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    and t4, t4, a4
+; RV64V-NEXT:    slli t1, t1, 2
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    or t1, t4, t1
+; RV64V-NEXT:    vsrl.vi v9, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, a3
+; RV64V-NEXT:    srli t4, t1, 1
+; RV64V-NEXT:    and t1, t1, a3
+; RV64V-NEXT:    vand.vx v9, v9, a3
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    and t4, t4, a3
+; RV64V-NEXT:    slli t1, t1, 1
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    or t1, t4, t1
+; RV64V-NEXT:    andi t4, t1, 2
+; RV64V-NEXT:    vmul.vx v9, v8, t4
+; RV64V-NEXT:    andi t4, t1, 1
+; RV64V-NEXT:    vmul.vx v10, v8, t4
+; RV64V-NEXT:    andi t4, t1, 4
+; RV64V-NEXT:    vmul.vx v11, v8, t4
+; RV64V-NEXT:    andi t4, t1, 8
+; RV64V-NEXT:    vmul.vx v12, v8, t4
+; RV64V-NEXT:    andi t4, t1, 16
+; RV64V-NEXT:    vmul.vx v13, v8, t4
+; RV64V-NEXT:    andi t4, t1, 32
+; RV64V-NEXT:    vmul.vx v14, v8, t4
+; RV64V-NEXT:    andi t4, t1, 64
+; RV64V-NEXT:    vxor.vv v9, v10, v9
+; RV64V-NEXT:    vmul.vx v10, v8, t4
+; RV64V-NEXT:    andi t4, t1, 128
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vmul.vx v11, v8, t4
+; RV64V-NEXT:    andi t4, t1, 256
+; RV64V-NEXT:    vxor.vv v9, v9, v12
+; RV64V-NEXT:    vmul.vx v12, v8, t4
+; RV64V-NEXT:    andi t4, t1, 512
+; RV64V-NEXT:    vxor.vv v9, v9, v13
+; RV64V-NEXT:    vmul.vx v13, v8, t4
+; RV64V-NEXT:    andi t4, t1, 1024
+; RV64V-NEXT:    vxor.vv v9, v9, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t4
+; RV64V-NEXT:    slli t4, a0, 11
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v9, v9, v10
+; RV64V-NEXT:    vmul.vx v10, v8, t4
+; RV64V-NEXT:    lui t4, 2
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vmul.vx v11, v8, t3
+; RV64V-NEXT:    lui t3, 4
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v12, v9, v12
+; RV64V-NEXT:    vxor.vv v12, v12, v13
+; RV64V-NEXT:    vmul.vx v13, v8, t4
+; RV64V-NEXT:    lui t4, 8
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t3
+; RV64V-NEXT:    lui t3, 32
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v10, v12, v10
+; RV64V-NEXT:    vmul.vx v12, v8, t4
+; RV64V-NEXT:    lui t4, 64
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vx v11, v8, t2
+; RV64V-NEXT:    lui t2, 128
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v10, v10, v13
+; RV64V-NEXT:    vmul.vx v13, v8, t3
+; RV64V-NEXT:    lui t3, 256
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v10, v10, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t4
+; RV64V-NEXT:    lui t4, 512
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v10, v10, v12
+; RV64V-NEXT:    vmul.vx v12, v8, t2
+; RV64V-NEXT:    lui t2, 1024
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vx v11, v8, t3
+; RV64V-NEXT:    lui t3, 2048
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v10, v10, v13
+; RV64V-NEXT:    vmul.vx v13, v8, t4
+; RV64V-NEXT:    lui t4, 4096
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v10, v10, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t2
+; RV64V-NEXT:    lui t2, 8192
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v10, v10, v12
+; RV64V-NEXT:    vmul.vx v12, v8, t3
+; RV64V-NEXT:    lui t3, 16384
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vx v11, v8, t4
+; RV64V-NEXT:    lui t4, 32768
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v10, v10, v13
+; RV64V-NEXT:    vmul.vx v13, v8, t2
+; RV64V-NEXT:    lui t2, 65536
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v10, v10, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t3
+; RV64V-NEXT:    lui t3, 131072
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v10, v10, v12
+; RV64V-NEXT:    vmul.vx v12, v8, t4
+; RV64V-NEXT:    lui t4, 262144
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v11, v11, v13
+; RV64V-NEXT:    vmul.vx v13, v8, t2
+; RV64V-NEXT:    slli t2, a0, 32
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vsll.vx v9, v9, a1
+; RV64V-NEXT:    vand.vx v10, v10, a6
+; RV64V-NEXT:    vsll.vx v10, v10, a7
+; RV64V-NEXT:    vor.vv v9, v9, v10
+; RV64V-NEXT:    vmul.vx v10, v8, t3
+; RV64V-NEXT:    slli t3, a0, 33
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v11, v11, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t4
+; RV64V-NEXT:    slli t4, a0, 34
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    vmul.vx v12, v8, t2
+; RV64V-NEXT:    slli t2, a0, 35
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v11, v11, v13
+; RV64V-NEXT:    vmul.vx v13, v8, t3
+; RV64V-NEXT:    slli t3, a0, 36
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vmul.vx v11, v8, t4
+; RV64V-NEXT:    srliw t4, t1, 31
+; RV64V-NEXT:    slli t4, t4, 31
+; RV64V-NEXT:    vxor.vv v10, v10, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t4
+; RV64V-NEXT:    slli t4, a0, 37
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v10, v10, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t2
+; RV64V-NEXT:    slli t2, a0, 38
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v10, v10, v12
+; RV64V-NEXT:    vmul.vx v12, v8, t3
+; RV64V-NEXT:    slli t3, a0, 39
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v10, v10, v13
+; RV64V-NEXT:    vmul.vx v13, v8, t4
+; RV64V-NEXT:    slli t4, a0, 40
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vx v11, v8, t2
+; RV64V-NEXT:    slli t2, a0, 41
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v10, v10, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t3
+; RV64V-NEXT:    slli t3, a0, 42
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v10, v10, v12
+; RV64V-NEXT:    vmul.vx v12, v8, t4
+; RV64V-NEXT:    slli t4, a0, 43
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v10, v10, v13
+; RV64V-NEXT:    vmul.vx v13, v8, t2
+; RV64V-NEXT:    slli t2, a0, 44
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vx v11, v8, t3
+; RV64V-NEXT:    slli t3, a0, 45
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v10, v10, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t4
+; RV64V-NEXT:    slli t4, a0, 46
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v12, v10, v12
+; RV64V-NEXT:    vxor.vv v12, v12, v13
+; RV64V-NEXT:    vmul.vx v13, v8, t2
+; RV64V-NEXT:    slli t2, a0, 47
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v11, v12, v11
+; RV64V-NEXT:    vmul.vx v12, v8, t3
+; RV64V-NEXT:    slli t3, a0, 48
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v11, v11, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t4
+; RV64V-NEXT:    slli t4, a0, 49
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v11, v11, v13
+; RV64V-NEXT:    vmul.vx v13, v8, t2
+; RV64V-NEXT:    slli t2, a0, 50
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    vmul.vx v12, v8, t3
+; RV64V-NEXT:    slli t3, a0, 51
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v11, v11, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t4
+; RV64V-NEXT:    slli t4, a0, 52
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v11, v11, v13
+; RV64V-NEXT:    vmul.vx v13, v8, t2
+; RV64V-NEXT:    slli t2, a0, 53
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v12, v11, v12
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t3
+; RV64V-NEXT:    slli t3, a0, 54
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v12, v12, v13
+; RV64V-NEXT:    vsrl.vi v13, v10, 8
+; RV64V-NEXT:    vand.vx v13, v13, t0
+; RV64V-NEXT:    vsrl.vi v11, v11, 24
+; RV64V-NEXT:    vand.vx v11, v11, a2
+; RV64V-NEXT:    vor.vv v11, v13, v11
+; RV64V-NEXT:    vmul.vx v13, v8, t4
+; RV64V-NEXT:    slli t4, a0, 55
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t2
+; RV64V-NEXT:    slli t2, a0, 56
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v12, v12, v13
+; RV64V-NEXT:    vmul.vx v13, v8, t3
+; RV64V-NEXT:    slli t3, a0, 57
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t4
+; RV64V-NEXT:    slli t4, a0, 58
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v12, v12, v13
+; RV64V-NEXT:    vmul.vx v13, v8, t2
+; RV64V-NEXT:    slli t2, a0, 59
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t3
+; RV64V-NEXT:    slli t3, a0, 60
+; RV64V-NEXT:    vand.vx v10, v10, a2
+; RV64V-NEXT:    slli a2, a0, 61
+; RV64V-NEXT:    slli a0, a0, 62
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    and a2, t1, a2
+; RV64V-NEXT:    and a0, t1, a0
+; RV64V-NEXT:    srli t1, t1, 63
+; RV64V-NEXT:    vsll.vi v10, v10, 24
+; RV64V-NEXT:    vxor.vv v13, v12, v13
+; RV64V-NEXT:    vxor.vv v13, v13, v14
+; RV64V-NEXT:    vand.vx v14, v12, t0
+; RV64V-NEXT:    vsll.vi v14, v14, 8
+; RV64V-NEXT:    vor.vv v10, v10, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t4
+; RV64V-NEXT:    vxor.vv v13, v13, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t2
+; RV64V-NEXT:    vor.vv v9, v9, v10
+; RV64V-NEXT:    vmul.vx v10, v8, t3
+; RV64V-NEXT:    vxor.vv v13, v13, v14
+; RV64V-NEXT:    vmul.vx v14, v8, a2
+; RV64V-NEXT:    vxor.vv v10, v13, v10
+; RV64V-NEXT:    vmul.vx v13, v8, a0
+; RV64V-NEXT:    slli t1, t1, 63
+; RV64V-NEXT:    vmul.vx v8, v8, t1
+; RV64V-NEXT:    vsrl.vx v12, v12, a7
+; RV64V-NEXT:    vand.vx v12, v12, a6
+; RV64V-NEXT:    vxor.vv v10, v10, v14
+; RV64V-NEXT:    vxor.vv v10, v10, v13
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    vsrl.vx v8, v8, a1
+; RV64V-NEXT:    vor.vv v8, v12, v8
+; RV64V-NEXT:    vor.vv v8, v11, v8
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    vsrl.vi v9, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, a5
+; RV64V-NEXT:    vand.vx v9, v9, a5
 ; RV64V-NEXT:    vsll.vi v8, v8, 4
-; RV64V-NEXT:    vor.vv v8, v16, v8
-; RV64V-NEXT:    vsrl.vi v16, v8, 2
-; RV64V-NEXT:    vand.vx v8, v8, t0
-; RV64V-NEXT:    vand.vx v16, v16, t0
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    vsrl.vi v9, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, a4
+; RV64V-NEXT:    vand.vx v9, v9, a4
 ; RV64V-NEXT:    vsll.vi v8, v8, 2
-; RV64V-NEXT:    vor.vv v8, v16, v8
-; RV64V-NEXT:    vsrl.vi v16, v8, 1
-; RV64V-NEXT:    vand.vx v8, v8, a0
-; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    vsrl.vi v9, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, a3
+; RV64V-NEXT:    vand.vx v9, v9, a3
 ; RV64V-NEXT:    vadd.vv v8, v8, v8
-; RV64V-NEXT:    vor.vv v8, v16, v8
-; RV64V-NEXT:    vand.vx v16, v0, a3
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 47
-; RV64V-NEXT:    sd a0, 112(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 48
-; RV64V-NEXT:    sd a0, 104(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 49
-; RV64V-NEXT:    sd a0, 96(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 50
-; RV64V-NEXT:    sd a0, 88(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 51
-; RV64V-NEXT:    sd a0, 80(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 52
-; RV64V-NEXT:    sd a0, 72(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 53
-; RV64V-NEXT:    sd a0, 64(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 54
-; RV64V-NEXT:    sd a0, 56(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 55
-; RV64V-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 56
-; RV64V-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli a0, t2, 57
-; RV64V-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli ra, t2, 58
-; RV64V-NEXT:    slli s10, t2, 59
-; RV64V-NEXT:    slli s8, t2, 60
-; RV64V-NEXT:    slli a0, t2, 61
-; RV64V-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    slli s11, t2, 62
-; RV64V-NEXT:    li a1, -1
-; RV64V-NEXT:    slli s9, a1, 63
-; RV64V-NEXT:    li a0, 64
-; RV64V-NEXT:    li a1, 128
-; RV64V-NEXT:    li a2, 256
-; RV64V-NEXT:    li a3, 512
-; RV64V-NEXT:    li a4, 1024
-; RV64V-NEXT:    lui a5, 1
-; RV64V-NEXT:    lui a6, 2
-; RV64V-NEXT:    lui a7, 4
-; RV64V-NEXT:    lui t0, 8
-; RV64V-NEXT:    lui t1, 32
-; RV64V-NEXT:    lui t2, 64
-; RV64V-NEXT:    lui t3, 128
-; RV64V-NEXT:    lui t4, 256
-; RV64V-NEXT:    lui t5, 512
-; RV64V-NEXT:    lui t6, 1024
-; RV64V-NEXT:    lui s0, 2048
-; RV64V-NEXT:    lui s1, 4096
-; RV64V-NEXT:    lui s2, 8192
-; RV64V-NEXT:    lui s3, 16384
-; RV64V-NEXT:    lui s4, 32768
-; RV64V-NEXT:    lui s5, 65536
-; RV64V-NEXT:    lui s6, 131072
-; RV64V-NEXT:    lui s7, 262144
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    sd s8, 8(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    mv s8, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add s8, s8, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, s8
-; RV64V-NEXT:    ld s8, 8(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, a1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 7
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, a2
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, a3
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, a4
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 248(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, a5
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, a6
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, a7
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, t0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    lui a0, 16
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, t1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, t2
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, t3
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, t4
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, t5
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, t6
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, s0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 9
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, s1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, s2
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, s3
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, s4
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, s5
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, s6
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    vsrl.vi v8, v8, 1
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv1i64_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    addi sp, sp, -16
+; RV32ZVBC-NEXT:    sw a0, 8(sp)
+; RV32ZVBC-NEXT:    sw a1, 12(sp)
+; RV32ZVBC-NEXT:    addi a0, sp, 8
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32ZVBC-NEXT:    vlse64.v v9, (a0), zero
+; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v9
+; RV32ZVBC-NEXT:    addi sp, sp, 16
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv1i64_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV64ZVBC-NEXT:    vclmulh.vx v8, v8, a0
+; RV64ZVBC-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i128 0
+  %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %va.ext = zext <vscale x 1 x i64> %va to <vscale x 1 x i128>
+  %vb.ext = zext <vscale x 1 x i64> %vb to <vscale x 1 x i128>
+  %clmul = call <vscale x 1 x i128> @llvm.clmul.nxv1i128(<vscale x 1 x i128> %va.ext, <vscale x 1 x i128> %vb.ext)
+  %res.ext = lshr <vscale x 1 x i128> %clmul, splat(i128 64)
+  %res = trunc <vscale x 1 x i128> %res.ext to <vscale x 1 x i64>
+  ret <vscale x 1 x i64> %res
+}
+
+define <vscale x 2 x i64> @clmulh_nxv2i64_vv(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb) nounwind {
+; RV32V-LABEL: clmulh_nxv2i64_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -352
+; RV32V-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    sub sp, sp, a0
+; RV32V-NEXT:    lui s8, 1044480
+; RV32V-NEXT:    lui t1, 524288
+; RV32V-NEXT:    li a1, 1
+; RV32V-NEXT:    li s11, 2
+; RV32V-NEXT:    li ra, 4
+; RV32V-NEXT:    li a7, 8
+; RV32V-NEXT:    li t0, 16
+; RV32V-NEXT:    li a6, 32
+; RV32V-NEXT:    li a5, 64
+; RV32V-NEXT:    li s10, 128
+; RV32V-NEXT:    li a4, 256
+; RV32V-NEXT:    li a3, 512
+; RV32V-NEXT:    li a2, 1024
+; RV32V-NEXT:    lui t3, 1
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    lui s9, 4
+; RV32V-NEXT:    lui t2, 8
+; RV32V-NEXT:    lui t4, 16
+; RV32V-NEXT:    lui t5, 32
+; RV32V-NEXT:    lui t6, 64
+; RV32V-NEXT:    lui s0, 128
+; RV32V-NEXT:    lui s1, 256
+; RV32V-NEXT:    lui s2, 512
+; RV32V-NEXT:    lui s3, 1024
+; RV32V-NEXT:    lui s4, 2048
+; RV32V-NEXT:    lui s5, 4096
+; RV32V-NEXT:    lui s6, 8192
+; RV32V-NEXT:    lui s7, 16384
+; RV32V-NEXT:    sw s8, 248(sp)
+; RV32V-NEXT:    lui s8, 32768
+; RV32V-NEXT:    sw zero, 252(sp)
+; RV32V-NEXT:    sw t1, 8(sp)
+; RV32V-NEXT:    sw zero, 12(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw a1, 276(sp)
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw s11, 260(sp)
+; RV32V-NEXT:    lui s11, 65536
+; RV32V-NEXT:    sw zero, 264(sp)
+; RV32V-NEXT:    sw ra, 268(sp)
+; RV32V-NEXT:    lui ra, 131072
+; RV32V-NEXT:    sw zero, 240(sp)
+; RV32V-NEXT:    sw a7, 244(sp)
+; RV32V-NEXT:    lui a7, 262144
+; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    sw t0, 236(sp)
+; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw a6, 228(sp)
+; RV32V-NEXT:    sw zero, 216(sp)
+; RV32V-NEXT:    sw a5, 220(sp)
+; RV32V-NEXT:    sw zero, 208(sp)
+; RV32V-NEXT:    sw s10, 212(sp)
+; RV32V-NEXT:    sw zero, 200(sp)
+; RV32V-NEXT:    sw a4, 204(sp)
+; RV32V-NEXT:    sw zero, 192(sp)
+; RV32V-NEXT:    sw a3, 196(sp)
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw a2, 188(sp)
+; RV32V-NEXT:    slli a5, a1, 11
+; RV32V-NEXT:    sw zero, 176(sp)
+; RV32V-NEXT:    sw a5, 180(sp)
+; RV32V-NEXT:    sw zero, 168(sp)
+; RV32V-NEXT:    sw t3, 172(sp)
+; RV32V-NEXT:    sw zero, 160(sp)
+; RV32V-NEXT:    sw a0, 164(sp)
+; RV32V-NEXT:    lui a6, 2
+; RV32V-NEXT:    sw zero, 152(sp)
+; RV32V-NEXT:    sw s9, 156(sp)
+; RV32V-NEXT:    lui t0, 4
+; RV32V-NEXT:    sw zero, 144(sp)
+; RV32V-NEXT:    sw t2, 148(sp)
+; RV32V-NEXT:    sw zero, 136(sp)
+; RV32V-NEXT:    sw t4, 140(sp)
+; RV32V-NEXT:    sw zero, 128(sp)
+; RV32V-NEXT:    sw t5, 132(sp)
+; RV32V-NEXT:    sw zero, 120(sp)
+; RV32V-NEXT:    sw t6, 124(sp)
+; RV32V-NEXT:    sw zero, 112(sp)
+; RV32V-NEXT:    sw s0, 116(sp)
+; RV32V-NEXT:    sw zero, 104(sp)
+; RV32V-NEXT:    sw s1, 108(sp)
+; RV32V-NEXT:    sw zero, 96(sp)
+; RV32V-NEXT:    sw s2, 100(sp)
+; RV32V-NEXT:    sw zero, 88(sp)
+; RV32V-NEXT:    sw s3, 92(sp)
+; RV32V-NEXT:    sw zero, 80(sp)
+; RV32V-NEXT:    sw s4, 84(sp)
+; RV32V-NEXT:    sw zero, 72(sp)
+; RV32V-NEXT:    sw s5, 76(sp)
+; RV32V-NEXT:    sw zero, 64(sp)
+; RV32V-NEXT:    sw s6, 68(sp)
+; RV32V-NEXT:    sw zero, 56(sp)
+; RV32V-NEXT:    sw s7, 60(sp)
+; RV32V-NEXT:    sw zero, 48(sp)
+; RV32V-NEXT:    sw s8, 52(sp)
+; RV32V-NEXT:    sw zero, 40(sp)
+; RV32V-NEXT:    sw s11, 44(sp)
+; RV32V-NEXT:    sw zero, 32(sp)
+; RV32V-NEXT:    sw ra, 36(sp)
+; RV32V-NEXT:    sw zero, 24(sp)
+; RV32V-NEXT:    sw a7, 28(sp)
+; RV32V-NEXT:    sw zero, 16(sp)
+; RV32V-NEXT:    sw t1, 20(sp)
+; RV32V-NEXT:    lui a0, 61681
+; RV32V-NEXT:    addi a0, a0, -241
+; RV32V-NEXT:    vsetvli t3, zero, e32, m2, ta, ma
+; RV32V-NEXT:    vmv.v.x v2, a0
+; RV32V-NEXT:    lui a0, 209715
+; RV32V-NEXT:    addi a0, a0, 819
+; RV32V-NEXT:    vmv.v.x v0, a0
+; RV32V-NEXT:    lui a0, 349525
+; RV32V-NEXT:    addi a0, a0, 1365
+; RV32V-NEXT:    vmv.v.x v12, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 248
+; RV32V-NEXT:    vsetvli t3, zero, e64, m2, ta, ma
+; RV32V-NEXT:    vlse64.v v4, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 8
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 272
+; RV32V-NEXT:    vlse64.v v14, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 256
+; RV32V-NEXT:    vlse64.v v16, (a0), zero
+; RV32V-NEXT:    li t3, 56
+; RV32V-NEXT:    vsrl.vi v18, v8, 24
+; RV32V-NEXT:    vsrl.vi v20, v8, 8
+; RV32V-NEXT:    vsrl.vx v22, v8, t3
+; RV32V-NEXT:    li s9, 40
+; RV32V-NEXT:    vsrl.vx v24, v8, s9
+; RV32V-NEXT:    vsll.vx v26, v8, t3
+; RV32V-NEXT:    vsrl.vx v28, v10, t3
+; RV32V-NEXT:    vsrl.vx v30, v10, s9
+; RV32V-NEXT:    addi s10, t4, -256
+; RV32V-NEXT:    vand.vx v24, v24, s10
+; RV32V-NEXT:    vor.vv v22, v24, v22
+; RV32V-NEXT:    vsll.vx v6, v10, t3
+; RV32V-NEXT:    vand.vx v24, v30, s10
+; RV32V-NEXT:    vor.vv v30, v24, v28
+; RV32V-NEXT:    vand.vx v24, v8, s10
+; RV32V-NEXT:    vsll.vx v24, v24, s9
+; RV32V-NEXT:    vor.vv v24, v26, v24
+; RV32V-NEXT:    vand.vx v26, v10, s10
+; RV32V-NEXT:    vsll.vx v26, v26, s9
+; RV32V-NEXT:    vor.vv v26, v6, v26
+; RV32V-NEXT:    vsrl.vi v28, v10, 24
+; RV32V-NEXT:    lui a4, 4080
+; RV32V-NEXT:    vand.vx v18, v18, a4
+; RV32V-NEXT:    vand.vv v20, v20, v4
+; RV32V-NEXT:    vor.vv v20, v20, v18
+; RV32V-NEXT:    vsrl.vi v18, v10, 8
+; RV32V-NEXT:    vand.vx v28, v28, a4
+; RV32V-NEXT:    vand.vv v18, v18, v4
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v4, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vor.vv v6, v18, v28
+; RV32V-NEXT:    addi a3, sp, 264
+; RV32V-NEXT:    vlse64.v v18, (a3), zero
+; RV32V-NEXT:    vor.vv v28, v20, v22
+; RV32V-NEXT:    vand.vx v20, v8, a4
+; RV32V-NEXT:    vsll.vi v20, v20, 24
+; RV32V-NEXT:    vand.vv v8, v8, v4
+; RV32V-NEXT:    vsll.vi v8, v8, 8
+; RV32V-NEXT:    vor.vv v8, v20, v8
+; RV32V-NEXT:    addi a3, sp, 240
+; RV32V-NEXT:    vlse64.v v20, (a3), zero
+; RV32V-NEXT:    vor.vv v30, v6, v30
+; RV32V-NEXT:    vand.vx v22, v10, a4
+; RV32V-NEXT:    vsll.vi v22, v22, 24
+; RV32V-NEXT:    vand.vv v10, v10, v4
+; RV32V-NEXT:    vsll.vi v10, v10, 8
+; RV32V-NEXT:    vor.vv v10, v22, v10
+; RV32V-NEXT:    addi a3, sp, 232
+; RV32V-NEXT:    vlse64.v v22, (a3), zero
+; RV32V-NEXT:    vor.vv v8, v24, v8
+; RV32V-NEXT:    addi a3, sp, 224
+; RV32V-NEXT:    vlse64.v v24, (a3), zero
+; RV32V-NEXT:    vor.vv v10, v26, v10
+; RV32V-NEXT:    addi a3, sp, 216
+; RV32V-NEXT:    vlse64.v v26, (a3), zero
+; RV32V-NEXT:    vor.vv v8, v8, v28
+; RV32V-NEXT:    addi a3, sp, 208
+; RV32V-NEXT:    vlse64.v v28, (a3), zero
+; RV32V-NEXT:    vor.vv v10, v10, v30
+; RV32V-NEXT:    vsrl.vi v30, v8, 4
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v2, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v8, v8, v2
+; RV32V-NEXT:    vand.vv v30, v30, v2
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v30, v8
+; RV32V-NEXT:    vsrl.vi v30, v10, 4
+; RV32V-NEXT:    vand.vv v10, v10, v2
+; RV32V-NEXT:    vand.vv v30, v30, v2
+; RV32V-NEXT:    vsll.vi v10, v10, 4
+; RV32V-NEXT:    vor.vv v10, v30, v10
+; RV32V-NEXT:    vsrl.vi v30, v8, 2
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v0, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v8, v8, v0
+; RV32V-NEXT:    vand.vv v30, v30, v0
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v30, v8
+; RV32V-NEXT:    vsrl.vi v30, v10, 2
+; RV32V-NEXT:    vand.vv v10, v10, v0
+; RV32V-NEXT:    vand.vv v30, v30, v0
+; RV32V-NEXT:    vsll.vi v10, v10, 2
+; RV32V-NEXT:    vor.vv v30, v30, v10
+; RV32V-NEXT:    vsrl.vi v10, v8, 1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v6, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v6
+; RV32V-NEXT:    vand.vv v10, v10, v6
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v10, v10, v8
+; RV32V-NEXT:    vsrl.vi v8, v30, 1
+; RV32V-NEXT:    vand.vv v30, v30, v6
+; RV32V-NEXT:    vand.vv v8, v8, v6
+; RV32V-NEXT:    vadd.vv v30, v30, v30
+; RV32V-NEXT:    vor.vv v8, v8, v30
+; RV32V-NEXT:    addi a3, sp, 200
+; RV32V-NEXT:    vlse64.v v30, (a3), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v22
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v26
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v28
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    addi a3, sp, 192
+; RV32V-NEXT:    addi a1, sp, 184
+; RV32V-NEXT:    addi a0, sp, 176
+; RV32V-NEXT:    vlse64.v v12, (a3), zero
+; RV32V-NEXT:    vlse64.v v14, (a1), zero
+; RV32V-NEXT:    vlse64.v v16, (a0), zero
+; RV32V-NEXT:    vand.vv v18, v8, v30
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v18, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    addi a2, sp, 168
+; RV32V-NEXT:    addi a1, sp, 160
+; RV32V-NEXT:    addi a3, sp, 152
+; RV32V-NEXT:    addi a0, sp, 144
+; RV32V-NEXT:    vlse64.v v12, (a2), zero
+; RV32V-NEXT:    vlse64.v v14, (a1), zero
+; RV32V-NEXT:    vlse64.v v16, (a3), zero
+; RV32V-NEXT:    vlse64.v v18, (a0), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 136
+; RV32V-NEXT:    addi a1, sp, 128
+; RV32V-NEXT:    addi a2, sp, 120
+; RV32V-NEXT:    addi a3, sp, 112
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v14, (a1), zero
+; RV32V-NEXT:    vlse64.v v16, (a2), zero
+; RV32V-NEXT:    vlse64.v v18, (a3), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 104
+; RV32V-NEXT:    addi a1, sp, 96
+; RV32V-NEXT:    addi a2, sp, 88
+; RV32V-NEXT:    addi a3, sp, 80
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v14, (a1), zero
+; RV32V-NEXT:    vlse64.v v16, (a2), zero
+; RV32V-NEXT:    vlse64.v v18, (a3), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 72
+; RV32V-NEXT:    addi a1, sp, 64
+; RV32V-NEXT:    addi a2, sp, 56
+; RV32V-NEXT:    addi a3, sp, 48
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v14, (a1), zero
+; RV32V-NEXT:    vlse64.v v16, (a2), zero
+; RV32V-NEXT:    vlse64.v v18, (a3), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 40
+; RV32V-NEXT:    addi a1, sp, 32
+; RV32V-NEXT:    addi a2, sp, 24
+; RV32V-NEXT:    addi a3, sp, 16
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v14, (a1), zero
+; RV32V-NEXT:    vlse64.v v16, (a2), zero
+; RV32V-NEXT:    vlse64.v v18, (a3), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vx v12, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vand.vx v18, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vand.vx v20, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v22, v8, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vand.vx v26, v8, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vand.vx v28, v8, a0
+; RV32V-NEXT:    vand.vx v30, v8, a5
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vand.vx v6, v8, a0
+; RV32V-NEXT:    vand.vx v4, v8, a6
+; RV32V-NEXT:    vand.vx v2, v8, t0
+; RV32V-NEXT:    vand.vx v0, v8, t2
+; RV32V-NEXT:    vand.vx v12, v8, t4
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, t5
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, t6
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s2
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s3
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s4
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s5
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s6
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s7
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s11
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, ra
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, a7
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vi v12, v8, 2
+; RV32V-NEXT:    vand.vi v14, v8, 1
+; RV32V-NEXT:    vand.vi v16, v8, 4
+; RV32V-NEXT:    vand.vi v8, v8, 8
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vmul.vv v12, v10, v14
+; RV32V-NEXT:    vmul.vv v14, v10, v16
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v10, v8
+; RV32V-NEXT:    vmul.vv v18, v10, v18
+; RV32V-NEXT:    vmul.vv v20, v10, v20
+; RV32V-NEXT:    vmul.vv v22, v10, v22
+; RV32V-NEXT:    vmul.vv v24, v10, v24
+; RV32V-NEXT:    vmul.vv v26, v10, v26
+; RV32V-NEXT:    vmul.vv v28, v10, v28
+; RV32V-NEXT:    vmul.vv v30, v10, v30
+; RV32V-NEXT:    vmul.vv v6, v10, v6
+; RV32V-NEXT:    vmul.vv v4, v10, v4
+; RV32V-NEXT:    vmul.vv v2, v10, v2
+; RV32V-NEXT:    vmul.vv v0, v10, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    addi a0, sp, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v10, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v12, v8
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v12
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v18
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v22
+; RV32V-NEXT:    vxor.vv v12, v8, v24
+; RV32V-NEXT:    vxor.vv v12, v12, v26
+; RV32V-NEXT:    vxor.vv v12, v12, v28
+; RV32V-NEXT:    vxor.vv v12, v12, v30
+; RV32V-NEXT:    vxor.vv v12, v12, v6
+; RV32V-NEXT:    vxor.vv v12, v12, v4
+; RV32V-NEXT:    vxor.vv v12, v12, v2
+; RV32V-NEXT:    vxor.vv v12, v12, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v12, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    addi a0, sp, 288
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v18, v16, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v18, v18, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v18, v18, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v18, v18, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v18, v18, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v18, v18, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v18, v18, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v18, v18, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v18, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v22, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v22
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v22, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v22
+; RV32V-NEXT:    vand.vx v22, v14, a4
+; RV32V-NEXT:    vsrl.vi v16, v16, 24
+; RV32V-NEXT:    vand.vx v16, v16, a4
+; RV32V-NEXT:    vand.vx v12, v12, s10
+; RV32V-NEXT:    vsll.vx v12, v12, s9
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v24, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v24
+; RV32V-NEXT:    vsrl.vx v24, v18, s9
+; RV32V-NEXT:    vand.vx v24, v24, s10
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v26, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v26
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v26, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v26
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v26, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v26
+; RV32V-NEXT:    vxor.vv v10, v20, v10
+; RV32V-NEXT:    vsll.vx v8, v8, t3
+; RV32V-NEXT:    vsrl.vx v10, v10, t3
+; RV32V-NEXT:    vor.vv v8, v8, v12
+; RV32V-NEXT:    vsrl.vi v12, v14, 8
+; RV32V-NEXT:    vsll.vi v14, v22, 24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vand.vv v12, v12, v20
+; RV32V-NEXT:    vor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vv v16, v18, v20
+; RV32V-NEXT:    vsll.vi v16, v16, 8
+; RV32V-NEXT:    vor.vv v14, v14, v16
+; RV32V-NEXT:    vor.vv v8, v8, v14
+; RV32V-NEXT:    vor.vv v10, v24, v10
+; RV32V-NEXT:    vor.vv v10, v12, v10
+; RV32V-NEXT:    vor.vv v8, v8, v10
+; RV32V-NEXT:    vsrl.vi v10, v8, 4
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v12
+; RV32V-NEXT:    vand.vv v10, v10, v12
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v10, v8
+; RV32V-NEXT:    vsrl.vi v10, v8, 2
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v12
+; RV32V-NEXT:    vand.vv v10, v10, v12
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v10, v8
+; RV32V-NEXT:    vsrl.vi v10, v8, 1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v12
+; RV32V-NEXT:    vand.vv v10, v10, v12
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v10, v8
+; RV32V-NEXT:    vsrl.vi v8, v8, 1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add sp, sp, a0
+; RV32V-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    addi sp, sp, 352
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv2i64_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    addi sp, sp, -320
+; RV64V-NEXT:    sd ra, 312(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s0, 304(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s1, 296(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s2, 288(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s3, 280(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s4, 272(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s5, 264(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s6, 256(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s7, 248(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s8, 240(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s9, 232(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s10, 224(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s11, 216(sp) # 8-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, s7
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 240(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 232(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 224(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 216(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 208(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 200(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 192(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 184(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 176(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 7
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 168(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    sub sp, sp, a0
+; RV64V-NEXT:    lui a1, 16
+; RV64V-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64V-NEXT:    vsrl.vi v16, v8, 24
+; RV64V-NEXT:    vsrl.vi v12, v8, 8
+; RV64V-NEXT:    li a4, 255
+; RV64V-NEXT:    lui a0, 61681
+; RV64V-NEXT:    lui a2, 209715
+; RV64V-NEXT:    lui a3, 349525
+; RV64V-NEXT:    vsrl.vi v18, v10, 24
+; RV64V-NEXT:    vsrl.vi v14, v10, 8
+; RV64V-NEXT:    li a5, 16
+; RV64V-NEXT:    li a6, 32
+; RV64V-NEXT:    li a7, 64
+; RV64V-NEXT:    li t0, 128
+; RV64V-NEXT:    li t2, 256
+; RV64V-NEXT:    li t3, 512
+; RV64V-NEXT:    li t4, 1024
+; RV64V-NEXT:    li t1, 1
+; RV64V-NEXT:    lui s4, 1
+; RV64V-NEXT:    lui s3, 2
+; RV64V-NEXT:    lui s5, 4
+; RV64V-NEXT:    lui t5, 8
+; RV64V-NEXT:    lui s0, 32
+; RV64V-NEXT:    li s6, 56
+; RV64V-NEXT:    vsrl.vx v30, v8, s6
+; RV64V-NEXT:    li s7, 40
+; RV64V-NEXT:    vsrl.vx v0, v8, s7
+; RV64V-NEXT:    addi s9, a1, -256
+; RV64V-NEXT:    lui s8, 4080
+; RV64V-NEXT:    vand.vx v2, v16, s8
+; RV64V-NEXT:    slli a4, a4, 24
+; RV64V-NEXT:    vand.vx v4, v8, s8
+; RV64V-NEXT:    vsll.vx v6, v8, s6
+; RV64V-NEXT:    addi t6, a0, -241
+; RV64V-NEXT:    addi s1, a2, 819
+; RV64V-NEXT:    addi s2, a3, 1365
+; RV64V-NEXT:    vsrl.vx v20, v10, s6
+; RV64V-NEXT:    vsrl.vx v22, v10, s7
+; RV64V-NEXT:    vand.vx v18, v18, s8
+; RV64V-NEXT:    vand.vx v24, v10, s8
+; RV64V-NEXT:    vsll.vx v16, v10, s6
+; RV64V-NEXT:    slli s11, t1, 11
+; RV64V-NEXT:    slli a0, t1, 31
+; RV64V-NEXT:    sd a0, 152(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t1, 32
+; RV64V-NEXT:    sd a0, 144(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t1, 33
+; RV64V-NEXT:    sd a0, 136(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t1, 34
+; RV64V-NEXT:    sd a0, 128(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t6, 32
+; RV64V-NEXT:    add t6, t6, a0
+; RV64V-NEXT:    slli a0, s1, 32
+; RV64V-NEXT:    add s1, s1, a0
+; RV64V-NEXT:    slli a0, s2, 32
+; RV64V-NEXT:    add s2, s2, a0
+; RV64V-NEXT:    slli a0, t1, 35
+; RV64V-NEXT:    sd a0, 120(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v22, v22, s9
+; RV64V-NEXT:    vand.vx v14, v14, a4
+; RV64V-NEXT:    vsll.vi v24, v24, 24
+; RV64V-NEXT:    vand.vx v26, v10, a4
+; RV64V-NEXT:    vand.vx v10, v10, s9
+; RV64V-NEXT:    vor.vv v20, v22, v20
+; RV64V-NEXT:    vor.vv v14, v14, v18
+; RV64V-NEXT:    vsll.vi v18, v26, 8
+; RV64V-NEXT:    li a0, 40
+; RV64V-NEXT:    vsll.vx v10, v10, a0
+; RV64V-NEXT:    vor.vv v14, v14, v20
+; RV64V-NEXT:    vor.vv v18, v24, v18
+; RV64V-NEXT:    vor.vv v10, v16, v10
+; RV64V-NEXT:    vor.vv v10, v10, v18
+; RV64V-NEXT:    vor.vv v10, v10, v14
+; RV64V-NEXT:    vsrl.vi v14, v10, 4
+; RV64V-NEXT:    vand.vx v10, v10, t6
+; RV64V-NEXT:    vand.vx v14, v14, t6
+; RV64V-NEXT:    vsll.vi v10, v10, 4
+; RV64V-NEXT:    vor.vv v10, v14, v10
+; RV64V-NEXT:    vsrl.vi v14, v10, 2
+; RV64V-NEXT:    vand.vx v10, v10, s1
+; RV64V-NEXT:    vand.vx v14, v14, s1
+; RV64V-NEXT:    vsll.vi v10, v10, 2
+; RV64V-NEXT:    vor.vv v10, v14, v10
+; RV64V-NEXT:    vsrl.vi v14, v10, 1
+; RV64V-NEXT:    vand.vx v10, v10, s2
+; RV64V-NEXT:    vand.vx v14, v14, s2
+; RV64V-NEXT:    vadd.vv v10, v10, v10
+; RV64V-NEXT:    vor.vv v14, v14, v10
+; RV64V-NEXT:    vand.vx v20, v14, a5
+; RV64V-NEXT:    slli a2, t1, 36
+; RV64V-NEXT:    sd a2, 112(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v22, v14, a6
+; RV64V-NEXT:    slli a2, t1, 37
+; RV64V-NEXT:    sd a2, 104(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v24, v14, a7
+; RV64V-NEXT:    slli a2, t1, 38
+; RV64V-NEXT:    sd a2, 96(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v26, v14, t0
+; RV64V-NEXT:    slli a2, t1, 39
+; RV64V-NEXT:    sd a2, 88(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v28, v14, t2
+; RV64V-NEXT:    slli a2, t1, 40
+; RV64V-NEXT:    sd a2, 80(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, t3
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 7
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 208
+; RV64V-NEXT:    vs2r.v v10, (a2) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    slli a2, t1, 41
+; RV64V-NEXT:    sd a2, 72(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, t4
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 208
+; RV64V-NEXT:    vs2r.v v10, (a2) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    slli a2, t1, 42
+; RV64V-NEXT:    sd a2, 64(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, s11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 208
+; RV64V-NEXT:    vs2r.v v10, (a2) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    slli a2, t1, 43
+; RV64V-NEXT:    sd a2, 56(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s9, 160(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v0, s9
+; RV64V-NEXT:    vor.vv v10, v10, v30
+; RV64V-NEXT:    vand.vx v0, v14, s4
+; RV64V-NEXT:    slli a2, t1, 44
+; RV64V-NEXT:    sd a2, 48(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd a4, 168(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v12, a4
+; RV64V-NEXT:    vor.vv v12, v12, v2
+; RV64V-NEXT:    vand.vx v2, v14, s3
+; RV64V-NEXT:    slli a2, t1, 45
+; RV64V-NEXT:    sd a2, 40(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vsll.vi v16, v4, 24
+; RV64V-NEXT:    vor.vv v10, v12, v10
+; RV64V-NEXT:    vand.vx v12, v8, a4
+; RV64V-NEXT:    vsll.vi v12, v12, 8
+; RV64V-NEXT:    vor.vv v12, v16, v12
+; RV64V-NEXT:    vand.vx v16, v14, s5
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 208
+; RV64V-NEXT:    vs2r.v v16, (a2) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    slli a2, t1, 46
+; RV64V-NEXT:    sd a2, 32(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v8, s9
+; RV64V-NEXT:    vsll.vx v8, v8, a0
+; RV64V-NEXT:    vor.vv v8, v6, v8
+; RV64V-NEXT:    vand.vx v16, v14, t5
 ; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    mv a2, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    add a2, a2, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    add a2, a2, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, a0, a2
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 160(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v16, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    slli a0, t1, 47
+; RV64V-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vor.vv v8, v8, v12
+; RV64V-NEXT:    vand.vx v12, v14, a1
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 152(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 144(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 136(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    slli a0, t1, 48
+; RV64V-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vor.vv v8, v8, v10
+; RV64V-NEXT:    vsrl.vi v10, v8, 4
+; RV64V-NEXT:    sd t6, 176(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v8, t6
+; RV64V-NEXT:    vand.vx v10, v10, t6
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v10, v8
+; RV64V-NEXT:    vsrl.vi v10, v8, 2
+; RV64V-NEXT:    sd s1, 184(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v8, s1
+; RV64V-NEXT:    vand.vx v10, v10, s1
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v10, v8
+; RV64V-NEXT:    vsrl.vi v10, v8, 1
+; RV64V-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v8, s2
+; RV64V-NEXT:    vand.vx v10, v10, s2
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v10, v8
+; RV64V-NEXT:    vand.vx v10, v14, s0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 128(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 120(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 112(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 104(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    slli ra, t1, 49
+; RV64V-NEXT:    slli s11, t1, 50
+; RV64V-NEXT:    slli s10, t1, 51
+; RV64V-NEXT:    slli s9, t1, 52
+; RV64V-NEXT:    slli s8, t1, 53
+; RV64V-NEXT:    slli s7, t1, 54
+; RV64V-NEXT:    slli s6, t1, 55
+; RV64V-NEXT:    slli s5, t1, 56
+; RV64V-NEXT:    slli s4, t1, 57
+; RV64V-NEXT:    slli s3, t1, 58
+; RV64V-NEXT:    slli s2, t1, 59
+; RV64V-NEXT:    slli t6, t1, 60
+; RV64V-NEXT:    slli s1, t1, 61
+; RV64V-NEXT:    slli s0, t1, 62
+; RV64V-NEXT:    li a0, -1
+; RV64V-NEXT:    slli t5, a0, 63
+; RV64V-NEXT:    lui a0, 64
+; RV64V-NEXT:    lui a1, 128
+; RV64V-NEXT:    lui a2, 256
+; RV64V-NEXT:    lui a3, 512
+; RV64V-NEXT:    lui a4, 1024
+; RV64V-NEXT:    lui a5, 2048
+; RV64V-NEXT:    lui a6, 4096
+; RV64V-NEXT:    lui a7, 8192
+; RV64V-NEXT:    lui t0, 16384
+; RV64V-NEXT:    lui t1, 32768
+; RV64V-NEXT:    lui t2, 65536
+; RV64V-NEXT:    lui t3, 131072
+; RV64V-NEXT:    lui t4, 262144
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    sd s6, 8(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    mv s6, a0
 ; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    add s6, s6, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    add s6, s6, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, a0, s6
+; RV64V-NEXT:    ld s6, 8(sp) # 8-byte Folded Reload
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 96(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, a1
 ; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 88(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 80(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 72(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 64(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 56(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 8
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, a2
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, a3
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, ra
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, a4
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, s10
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, a5
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, s8
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, a6
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vi v16, v0, 2
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, a7
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vi v24, v0, 1
-; RV64V-NEXT:    vand.vi v16, v0, 4
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, t0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vi v16, v0, 8
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, t1
 ; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v0, a0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, s11
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, t2
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vand.vx v16, v0, s9
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, t3
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, t4
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vmul.vv v16, v8, v24
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    ld a0, 152(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v24, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    ld a0, 144(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v0, v8, v16
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    ld a0, 136(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    ld a0, 128(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    ld a0, 120(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    ld a0, 112(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    ld a0, 104(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    ld a0, 96(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 7
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    ld a0, 88(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    ld a0, 80(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    ld a0, 72(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    ld a0, 64(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    ld a0, 56(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 7
+; RV64V-NEXT:    slli a0, a0, 6
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
@@ -44046,247 +16195,241 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v10, v14, a0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, ra
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, s11
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, s10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, s9
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, s8
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, s7
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, s6
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, s5
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, s4
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, s3
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, s2
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v14, t6
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vi v10, v14, 2
+; RV64V-NEXT:    vand.vi v12, v14, 1
+; RV64V-NEXT:    vand.vi v16, v14, 4
+; RV64V-NEXT:    vand.vi v18, v14, 8
+; RV64V-NEXT:    vand.vx v30, v14, s1
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v30, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v30, v14, s0
 ; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v30, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vand.vx v14, v14, t5
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v14, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vmul.vv v10, v8, v12
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 9
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    vmul.vv v12, v8, v16
+; RV64V-NEXT:    vmul.vv v14, v8, v18
+; RV64V-NEXT:    vmul.vv v16, v8, v20
+; RV64V-NEXT:    vmul.vv v18, v8, v22
+; RV64V-NEXT:    vmul.vv v20, v8, v24
+; RV64V-NEXT:    vmul.vv v22, v8, v26
+; RV64V-NEXT:    vmul.vv v24, v8, v28
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 7
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v26, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v26, v8, v26
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
@@ -44299,16 +16442,11 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v28, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v28, v8, v28
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
@@ -44319,13 +16457,13 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
-; RV64V-NEXT:    addi a0, sp, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v30, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v30, v8, v30
+; RV64V-NEXT:    vmul.vv v6, v8, v0
+; RV64V-NEXT:    vmul.vv v4, v8, v2
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
@@ -44336,21 +16474,11 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v2, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v2, v8, v2
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
@@ -44359,21 +16487,11 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v0, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v0, v8, v0
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
@@ -44384,19 +16502,19 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
@@ -44405,21 +16523,16 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a1, a1, a0
@@ -44428,38 +16541,40 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
@@ -44470,45 +16585,37 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 9
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
@@ -44517,144 +16624,108 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    addi a0, sp, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 7
+; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
@@ -44665,21 +16736,21 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
@@ -44688,25 +16759,21 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
@@ -44715,46 +16782,44 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
@@ -44763,84 +16828,88 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
@@ -44849,165 +16918,155 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 7
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 8
+; RV64V-NEXT:    slli a0, a0, 6
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
@@ -45018,19 +17077,19 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
@@ -45039,23 +17098,19 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
@@ -45064,1200 +17119,6066 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    slli a0, a0, 6
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v16, v8, v16
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 7
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v8, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v16, v8
-; RV64V-NEXT:    vxor.vv v8, v8, v24
-; RV64V-NEXT:    vxor.vv v8, v8, v0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v16, v8
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 7
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v24
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v24
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 6
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v24
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v24
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v24, v8, v24
-; RV64V-NEXT:    addi a0, sp, 304
-; RV64V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    li a0, 56
-; RV64V-NEXT:    vsll.vx v16, v16, a0
-; RV64V-NEXT:    ld a1, 280(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v8, a1
-; RV64V-NEXT:    li a2, 40
-; RV64V-NEXT:    vsll.vx v8, v8, a2
-; RV64V-NEXT:    vor.vv v8, v16, v8
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v24, v8
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 5
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 5
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 5
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 6
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 9
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 5
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v24, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 6
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v24, v8
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 5
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 7
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v0, v8, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v0, v0, v16
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v0, v16
-; RV64V-NEXT:    vsrl.vi v0, v24, 8
-; RV64V-NEXT:    ld a4, 288(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v0, v0, a4
-; RV64V-NEXT:    vsrl.vi v8, v8, 24
-; RV64V-NEXT:    lui a3, 4080
-; RV64V-NEXT:    vand.vx v8, v8, a3
-; RV64V-NEXT:    vor.vv v8, v0, v8
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 7
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v8, v8, v10
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v8, v10
+; RV64V-NEXT:    vxor.vv v10, v10, v12
+; RV64V-NEXT:    vxor.vv v10, v10, v14
+; RV64V-NEXT:    vxor.vv v10, v10, v16
+; RV64V-NEXT:    vxor.vv v10, v10, v18
+; RV64V-NEXT:    vxor.vv v10, v10, v20
+; RV64V-NEXT:    vxor.vv v10, v10, v22
+; RV64V-NEXT:    vxor.vv v12, v10, v24
+; RV64V-NEXT:    vxor.vv v12, v12, v26
+; RV64V-NEXT:    vxor.vv v12, v12, v28
+; RV64V-NEXT:    vxor.vv v12, v12, v30
+; RV64V-NEXT:    vxor.vv v12, v12, v6
+; RV64V-NEXT:    vxor.vv v12, v12, v4
+; RV64V-NEXT:    vxor.vv v12, v12, v2
+; RV64V-NEXT:    vxor.vv v12, v12, v0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 208
+; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v14, v12, v8
+; RV64V-NEXT:    addi a0, sp, 208
+; RV64V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v14, v14, v8
+; RV64V-NEXT:    li a0, 56
+; RV64V-NEXT:    vsll.vx v10, v10, a0
+; RV64V-NEXT:    ld a2, 160(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v12, a2
+; RV64V-NEXT:    li a4, 40
+; RV64V-NEXT:    vsll.vx v12, v12, a4
+; RV64V-NEXT:    vor.vv v10, v10, v12
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v12, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v14, v12
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 4
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 5
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 4
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v14, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v14, v12, v14
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v14, v14, v16
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v14, v14, v16
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 4
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v14, v14, v16
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 4
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v14, v14, v16
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 5
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v14, v14, v16
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 6
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v14, v14, v16
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v14, v14, v16
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v14, v16
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v18, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v18
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v18, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v18
+; RV64V-NEXT:    vsrl.vi v18, v12, 8
+; RV64V-NEXT:    ld a3, 168(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v18, v18, a3
+; RV64V-NEXT:    vsrl.vi v14, v14, 24
+; RV64V-NEXT:    lui a1, 4080
+; RV64V-NEXT:    vand.vx v14, v14, a1
+; RV64V-NEXT:    vor.vv v14, v18, v14
 ; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 4
-; RV64V-NEXT:    mv a6, a5
 ; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 4
-; RV64V-NEXT:    add a5, a5, a6
-; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 304
-; RV64V-NEXT:    vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 3
 ; RV64V-NEXT:    mv a6, a5
-; RV64V-NEXT:    slli a5, a5, 2
+; RV64V-NEXT:    slli a5, a5, 3
 ; RV64V-NEXT:    add a6, a6, a5
 ; RV64V-NEXT:    slli a5, a5, 1
 ; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    slli a5, a5, 1
 ; RV64V-NEXT:    add a5, a5, a6
 ; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 304
-; RV64V-NEXT:    vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v16, v8
+; RV64V-NEXT:    addi a5, a5, 208
+; RV64V-NEXT:    vl2r.v v18, (a5) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v18
 ; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 5
+; RV64V-NEXT:    slli a5, a5, 4
 ; RV64V-NEXT:    mv a6, a5
 ; RV64V-NEXT:    slli a5, a5, 1
 ; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    slli a5, a5, 1
 ; RV64V-NEXT:    add a5, a5, a6
 ; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 304
-; RV64V-NEXT:    vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    addi a5, a5, 208
+; RV64V-NEXT:    vl2r.v v18, (a5) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v18
 ; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    slli a5, a5, 1
 ; RV64V-NEXT:    mv a6, a5
 ; RV64V-NEXT:    slli a5, a5, 1
 ; RV64V-NEXT:    add a6, a6, a5
+; RV64V-NEXT:    slli a5, a5, 1
+; RV64V-NEXT:    add a6, a6, a5
 ; RV64V-NEXT:    slli a5, a5, 2
 ; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    slli a5, a5, 1
 ; RV64V-NEXT:    add a5, a5, a6
 ; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 304
-; RV64V-NEXT:    vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    addi a5, a5, 208
+; RV64V-NEXT:    vl2r.v v18, (a5) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v18
 ; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 4
+; RV64V-NEXT:    slli a5, a5, 2
 ; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 1
+; RV64V-NEXT:    add a6, a6, a5
 ; RV64V-NEXT:    slli a5, a5, 2
 ; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    slli a5, a5, 1
 ; RV64V-NEXT:    add a5, a5, a6
 ; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 304
-; RV64V-NEXT:    vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    addi a5, a5, 208
+; RV64V-NEXT:    vl2r.v v18, (a5) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v18
 ; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    slli a5, a5, 1
 ; RV64V-NEXT:    mv a6, a5
-; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    slli a5, a5, 2
 ; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 3
-; RV64V-NEXT:    add a5, a5, a6
-; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 304
-; RV64V-NEXT:    vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v8, v8, v16
-; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 6
-; RV64V-NEXT:    mv a6, a5
-; RV64V-NEXT:    slli a5, a5, 3
-; RV64V-NEXT:    add a5, a5, a6
-; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 304
-; RV64V-NEXT:    vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v8, v16
-; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a5, a5, 3
-; RV64V-NEXT:    mv a6, a5
-; RV64V-NEXT:    slli a5, a5, 1
+; RV64V-NEXT:    slli a5, a5, 2
 ; RV64V-NEXT:    add a6, a6, a5
 ; RV64V-NEXT:    slli a5, a5, 1
-; RV64V-NEXT:    add a6, a6, a5
-; RV64V-NEXT:    slli a5, a5, 4
 ; RV64V-NEXT:    add a5, a5, a6
 ; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 304
-; RV64V-NEXT:    vl8r.v v0, (a5) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v16, v16, v0
-; RV64V-NEXT:    vand.vx v24, v24, a3
-; RV64V-NEXT:    vsll.vi v24, v24, 24
-; RV64V-NEXT:    vand.vx v0, v8, a4
-; RV64V-NEXT:    vsll.vi v0, v0, 8
-; RV64V-NEXT:    vor.vv v24, v24, v0
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v0, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v0, v16, v0
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vor.vv v16, v16, v24
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v24, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v24, v0, v24
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v0, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 4
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v0, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 7
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v0, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    csrr a3, vlenb
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
-; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    addi a5, a5, 208
+; RV64V-NEXT:    vl2r.v v18, (a5) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v18
+; RV64V-NEXT:    vand.vx v12, v12, a1
+; RV64V-NEXT:    vsll.vi v12, v12, 24
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    mv a5, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a5, a5, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a5, a5, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a5, a5, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a5
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v18, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v18, v16, v18
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    mv a5, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a5, a5, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a5, a5, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a5
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v20, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v18, v18, v20
+; RV64V-NEXT:    vand.vx v20, v16, a3
+; RV64V-NEXT:    vsll.vi v20, v20, 8
+; RV64V-NEXT:    vor.vv v12, v12, v20
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v20, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v18, v18, v20
+; RV64V-NEXT:    vor.vv v10, v10, v12
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    mv a3, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a3, a3, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a3
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v12, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v18, v12
+; RV64V-NEXT:    vsrl.vx v16, v16, a4
+; RV64V-NEXT:    vand.vx v16, v16, a2
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 7
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v18, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v18
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    mv a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a2
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v18, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v18
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    mv a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a2
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v18, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v18
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    mv a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a2
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 208
+; RV64V-NEXT:    vl2r.v v8, (a1) # vscale x 16-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v12, v8
+; RV64V-NEXT:    vsrl.vx v8, v8, a0
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vor.vv v8, v14, v8
+; RV64V-NEXT:    vor.vv v8, v10, v8
+; RV64V-NEXT:    vsrl.vi v10, v8, 4
+; RV64V-NEXT:    ld a0, 176(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vand.vx v10, v10, a0
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v10, v8
+; RV64V-NEXT:    vsrl.vi v10, v8, 2
+; RV64V-NEXT:    ld a0, 184(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vand.vx v10, v10, a0
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v10, v8
+; RV64V-NEXT:    vsrl.vi v10, v8, 1
+; RV64V-NEXT:    ld a0, 192(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vand.vx v10, v10, a0
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v10, v8
+; RV64V-NEXT:    vsrl.vi v8, v8, 1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add sp, sp, a0
+; RV64V-NEXT:    ld ra, 312(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s0, 304(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s1, 296(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s2, 288(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s3, 280(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s4, 272(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s5, 264(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s6, 256(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s7, 248(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s8, 240(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s9, 232(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s10, 224(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s11, 216(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    addi sp, sp, 320
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv2i64_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v10
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv2i64_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC-NEXT:    vclmulh.vv v8, v8, v10
+; RV64ZVBC-NEXT:    ret
+  %va.ext = zext <vscale x 2 x i64> %va to <vscale x 2 x i128>
+  %vb.ext = zext <vscale x 2 x i64> %vb to <vscale x 2 x i128>
+  %clmul = call <vscale x 2 x i128> @llvm.clmul.nxv2i128(<vscale x 2 x i128> %va.ext, <vscale x 2 x i128> %vb.ext)
+  %res.ext = lshr <vscale x 2 x i128> %clmul, splat(i128 64)
+  %res = trunc <vscale x 2 x i128> %res.ext to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @clmulh_nxv2i64_vx(<vscale x 2 x i64> %va, i64 %b) nounwind {
+; RV32V-LABEL: clmulh_nxv2i64_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -368
+; RV32V-NEXT:    sw ra, 364(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s0, 360(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s1, 356(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s2, 352(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s3, 348(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s4, 344(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s5, 340(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s6, 336(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s7, 332(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s8, 328(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s9, 324(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s10, 320(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s11, 316(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 2
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 2
+; RV32V-NEXT:    add a3, a3, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a3, a3, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    sub sp, sp, a2
+; RV32V-NEXT:    sw a0, 16(sp)
+; RV32V-NEXT:    sw a1, 20(sp)
+; RV32V-NEXT:    addi s7, sp, 16
+; RV32V-NEXT:    lui s9, 1044480
+; RV32V-NEXT:    li s1, 1
+; RV32V-NEXT:    li ra, 2
+; RV32V-NEXT:    li s3, 4
+; RV32V-NEXT:    li s5, 8
+; RV32V-NEXT:    li s6, 32
+; RV32V-NEXT:    li s11, 64
+; RV32V-NEXT:    li s4, 128
+; RV32V-NEXT:    li s2, 256
+; RV32V-NEXT:    li s0, 512
+; RV32V-NEXT:    li t6, 1024
+; RV32V-NEXT:    lui t5, 1
+; RV32V-NEXT:    lui s8, 2
+; RV32V-NEXT:    lui t4, 4
+; RV32V-NEXT:    lui t2, 8
+; RV32V-NEXT:    lui t1, 16
+; RV32V-NEXT:    lui t0, 32
+; RV32V-NEXT:    lui a7, 64
+; RV32V-NEXT:    lui a6, 128
+; RV32V-NEXT:    lui a5, 256
+; RV32V-NEXT:    lui t3, 512
+; RV32V-NEXT:    lui a4, 1024
+; RV32V-NEXT:    lui a3, 2048
+; RV32V-NEXT:    lui a2, 4096
+; RV32V-NEXT:    lui a1, 8192
+; RV32V-NEXT:    vsetvli s10, zero, e64, m2, ta, ma
+; RV32V-NEXT:    vlse64.v v18, (s7), zero
+; RV32V-NEXT:    lui s10, 16384
+; RV32V-NEXT:    sw s9, 264(sp)
+; RV32V-NEXT:    lui s9, 32768
+; RV32V-NEXT:    sw zero, 268(sp)
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    sw a0, 24(sp)
+; RV32V-NEXT:    sw zero, 28(sp)
+; RV32V-NEXT:    sw zero, 288(sp)
+; RV32V-NEXT:    sw s1, 292(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw ra, 276(sp)
+; RV32V-NEXT:    lui ra, 65536
+; RV32V-NEXT:    sw zero, 280(sp)
+; RV32V-NEXT:    sw s3, 284(sp)
+; RV32V-NEXT:    lui s7, 131072
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw s5, 260(sp)
+; RV32V-NEXT:    lui s3, 262144
+; RV32V-NEXT:    sw zero, 248(sp)
+; RV32V-NEXT:    li s5, 16
+; RV32V-NEXT:    sw s5, 252(sp)
+; RV32V-NEXT:    sw zero, 240(sp)
+; RV32V-NEXT:    sw s6, 244(sp)
+; RV32V-NEXT:    li s5, 32
+; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    sw s11, 236(sp)
+; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw s4, 228(sp)
+; RV32V-NEXT:    sw zero, 216(sp)
+; RV32V-NEXT:    sw s2, 220(sp)
+; RV32V-NEXT:    sw zero, 208(sp)
+; RV32V-NEXT:    sw s0, 212(sp)
+; RV32V-NEXT:    sw zero, 200(sp)
+; RV32V-NEXT:    sw t6, 204(sp)
+; RV32V-NEXT:    slli s1, s1, 11
+; RV32V-NEXT:    sw zero, 192(sp)
+; RV32V-NEXT:    sw s1, 196(sp)
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw t5, 188(sp)
+; RV32V-NEXT:    sw zero, 176(sp)
+; RV32V-NEXT:    sw s8, 180(sp)
+; RV32V-NEXT:    sw zero, 168(sp)
+; RV32V-NEXT:    sw t4, 172(sp)
+; RV32V-NEXT:    lui s4, 4
+; RV32V-NEXT:    sw zero, 160(sp)
+; RV32V-NEXT:    sw t2, 164(sp)
+; RV32V-NEXT:    sw zero, 152(sp)
+; RV32V-NEXT:    sw t1, 156(sp)
+; RV32V-NEXT:    lui s0, 16
+; RV32V-NEXT:    sw zero, 144(sp)
+; RV32V-NEXT:    sw t0, 148(sp)
+; RV32V-NEXT:    sw zero, 136(sp)
+; RV32V-NEXT:    sw a7, 140(sp)
+; RV32V-NEXT:    lui t5, 64
+; RV32V-NEXT:    sw zero, 128(sp)
+; RV32V-NEXT:    sw a6, 132(sp)
+; RV32V-NEXT:    lui s2, 128
+; RV32V-NEXT:    sw zero, 120(sp)
+; RV32V-NEXT:    sw a5, 124(sp)
+; RV32V-NEXT:    lui t6, 256
+; RV32V-NEXT:    sw zero, 112(sp)
+; RV32V-NEXT:    sw t3, 116(sp)
+; RV32V-NEXT:    lui a7, 512
+; RV32V-NEXT:    sw zero, 104(sp)
+; RV32V-NEXT:    sw a4, 108(sp)
+; RV32V-NEXT:    lui t4, 1024
+; RV32V-NEXT:    sw zero, 96(sp)
+; RV32V-NEXT:    sw a3, 100(sp)
+; RV32V-NEXT:    lui a5, 2048
+; RV32V-NEXT:    sw zero, 88(sp)
+; RV32V-NEXT:    sw a2, 92(sp)
+; RV32V-NEXT:    lui a6, 4096
+; RV32V-NEXT:    sw zero, 80(sp)
+; RV32V-NEXT:    sw a1, 84(sp)
+; RV32V-NEXT:    lui a4, 8192
+; RV32V-NEXT:    sw zero, 72(sp)
+; RV32V-NEXT:    sw s10, 76(sp)
+; RV32V-NEXT:    lui t1, 16384
+; RV32V-NEXT:    sw zero, 64(sp)
+; RV32V-NEXT:    sw s9, 68(sp)
+; RV32V-NEXT:    sw zero, 56(sp)
+; RV32V-NEXT:    sw ra, 60(sp)
+; RV32V-NEXT:    sw zero, 48(sp)
+; RV32V-NEXT:    sw s7, 52(sp)
+; RV32V-NEXT:    sw zero, 40(sp)
+; RV32V-NEXT:    sw s3, 44(sp)
+; RV32V-NEXT:    sw zero, 32(sp)
+; RV32V-NEXT:    sw a0, 36(sp)
+; RV32V-NEXT:    lui t3, 61681
+; RV32V-NEXT:    addi t3, t3, -241
+; RV32V-NEXT:    vsetvli s8, zero, e32, m2, ta, ma
+; RV32V-NEXT:    vmv.v.x v4, t3
+; RV32V-NEXT:    lui t3, 209715
+; RV32V-NEXT:    addi t3, t3, 819
+; RV32V-NEXT:    vmv.v.x v2, t3
+; RV32V-NEXT:    lui t3, 349525
+; RV32V-NEXT:    addi t3, t3, 1365
+; RV32V-NEXT:    vmv.v.x v0, t3
+; RV32V-NEXT:    addi t3, sp, 264
+; RV32V-NEXT:    vsetvli s8, zero, e64, m2, ta, ma
+; RV32V-NEXT:    vlse64.v v6, (t3), zero
+; RV32V-NEXT:    addi t3, sp, 24
+; RV32V-NEXT:    vlse64.v v12, (t3), zero
+; RV32V-NEXT:    addi t3, sp, 288
+; RV32V-NEXT:    vlse64.v v14, (t3), zero
+; RV32V-NEXT:    addi t3, sp, 272
+; RV32V-NEXT:    vlse64.v v16, (t3), zero
+; RV32V-NEXT:    li t3, 56
+; RV32V-NEXT:    vsrl.vi v10, v8, 24
+; RV32V-NEXT:    vsrl.vi v20, v8, 8
+; RV32V-NEXT:    vsrl.vx v22, v8, t3
+; RV32V-NEXT:    li s8, 40
+; RV32V-NEXT:    vsrl.vx v24, v8, s8
+; RV32V-NEXT:    lui s11, 4080
+; RV32V-NEXT:    vand.vx v28, v10, s11
+; RV32V-NEXT:    vsll.vx v10, v8, t3
+; RV32V-NEXT:    addi s10, s0, -256
+; RV32V-NEXT:    vand.vx v24, v24, s10
+; RV32V-NEXT:    vand.vx v26, v8, s10
+; RV32V-NEXT:    vor.vv v22, v24, v22
+; RV32V-NEXT:    vsll.vx v24, v26, s8
+; RV32V-NEXT:    vor.vv v10, v10, v24
+; RV32V-NEXT:    vsrl.vx v24, v18, t3
+; RV32V-NEXT:    vsrl.vx v26, v18, s8
+; RV32V-NEXT:    vsll.vx v30, v18, t3
+; RV32V-NEXT:    vand.vx v26, v26, s10
+; RV32V-NEXT:    vor.vv v24, v26, v24
+; RV32V-NEXT:    vand.vx v26, v18, s10
+; RV32V-NEXT:    vsll.vx v26, v26, s8
+; RV32V-NEXT:    vor.vv v26, v30, v26
+; RV32V-NEXT:    vsrl.vi v30, v18, 24
+; RV32V-NEXT:    vand.vv v20, v20, v6
+; RV32V-NEXT:    vor.vv v28, v20, v28
+; RV32V-NEXT:    vsrl.vi v20, v18, 8
+; RV32V-NEXT:    vand.vx v30, v30, s11
+; RV32V-NEXT:    vand.vv v20, v20, v6
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v6, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vor.vv v30, v20, v30
+; RV32V-NEXT:    addi a3, sp, 280
+; RV32V-NEXT:    vlse64.v v20, (a3), zero
+; RV32V-NEXT:    vor.vv v28, v28, v22
+; RV32V-NEXT:    vand.vx v22, v8, s11
+; RV32V-NEXT:    vsll.vi v22, v22, 24
+; RV32V-NEXT:    vand.vv v8, v8, v6
+; RV32V-NEXT:    vsll.vi v8, v8, 8
+; RV32V-NEXT:    vor.vv v8, v22, v8
+; RV32V-NEXT:    addi a3, sp, 256
+; RV32V-NEXT:    vlse64.v v22, (a3), zero
+; RV32V-NEXT:    vor.vv v30, v30, v24
+; RV32V-NEXT:    vand.vx v24, v18, s11
+; RV32V-NEXT:    vsll.vi v24, v24, 24
+; RV32V-NEXT:    vand.vv v18, v18, v6
+; RV32V-NEXT:    vsll.vi v18, v18, 8
+; RV32V-NEXT:    vor.vv v6, v24, v18
+; RV32V-NEXT:    addi a3, sp, 248
+; RV32V-NEXT:    vlse64.v v18, (a3), zero
+; RV32V-NEXT:    vor.vv v8, v10, v8
+; RV32V-NEXT:    addi a3, sp, 240
+; RV32V-NEXT:    vlse64.v v24, (a3), zero
+; RV32V-NEXT:    vor.vv v10, v26, v6
+; RV32V-NEXT:    addi a3, sp, 232
+; RV32V-NEXT:    vlse64.v v26, (a3), zero
+; RV32V-NEXT:    vor.vv v8, v8, v28
+; RV32V-NEXT:    addi a3, sp, 224
+; RV32V-NEXT:    vlse64.v v28, (a3), zero
+; RV32V-NEXT:    vor.vv v10, v10, v30
+; RV32V-NEXT:    vsrl.vi v30, v8, 4
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v4, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v8, v8, v4
+; RV32V-NEXT:    vand.vv v30, v30, v4
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v30, v8
+; RV32V-NEXT:    vsrl.vi v30, v10, 4
+; RV32V-NEXT:    vand.vv v10, v10, v4
+; RV32V-NEXT:    vand.vv v30, v30, v4
+; RV32V-NEXT:    vsll.vi v10, v10, 4
+; RV32V-NEXT:    vor.vv v10, v30, v10
+; RV32V-NEXT:    vsrl.vi v30, v8, 2
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v2, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v8, v8, v2
+; RV32V-NEXT:    vand.vv v30, v30, v2
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v30, v8
+; RV32V-NEXT:    vsrl.vi v30, v10, 2
+; RV32V-NEXT:    vand.vv v10, v10, v2
+; RV32V-NEXT:    vand.vv v30, v30, v2
+; RV32V-NEXT:    vsll.vi v10, v10, 2
+; RV32V-NEXT:    vor.vv v30, v30, v10
+; RV32V-NEXT:    vsrl.vi v10, v8, 1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v0, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v8, v8, v0
+; RV32V-NEXT:    vand.vv v10, v10, v0
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v10, v10, v8
+; RV32V-NEXT:    vsrl.vi v8, v30, 1
+; RV32V-NEXT:    vand.vv v30, v30, v0
+; RV32V-NEXT:    vand.vv v8, v8, v0
+; RV32V-NEXT:    vadd.vv v30, v30, v30
+; RV32V-NEXT:    vor.vv v8, v8, v30
+; RV32V-NEXT:    addi a3, sp, 216
+; RV32V-NEXT:    vlse64.v v30, (a3), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v22
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v26
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v28
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    addi a3, sp, 208
+; RV32V-NEXT:    addi a1, sp, 200
+; RV32V-NEXT:    addi a0, sp, 192
+; RV32V-NEXT:    vlse64.v v12, (a3), zero
+; RV32V-NEXT:    vlse64.v v14, (a1), zero
+; RV32V-NEXT:    vlse64.v v16, (a0), zero
+; RV32V-NEXT:    vand.vv v18, v8, v30
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v18, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    addi a2, sp, 184
+; RV32V-NEXT:    addi a1, sp, 176
+; RV32V-NEXT:    addi a3, sp, 168
+; RV32V-NEXT:    addi a0, sp, 160
+; RV32V-NEXT:    vlse64.v v12, (a2), zero
+; RV32V-NEXT:    vlse64.v v14, (a1), zero
+; RV32V-NEXT:    vlse64.v v16, (a3), zero
+; RV32V-NEXT:    vlse64.v v18, (a0), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 152
+; RV32V-NEXT:    addi a1, sp, 144
+; RV32V-NEXT:    addi a2, sp, 136
+; RV32V-NEXT:    addi a3, sp, 128
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v14, (a1), zero
+; RV32V-NEXT:    vlse64.v v16, (a2), zero
+; RV32V-NEXT:    vlse64.v v18, (a3), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 120
+; RV32V-NEXT:    addi a1, sp, 112
+; RV32V-NEXT:    addi a2, sp, 104
+; RV32V-NEXT:    addi a3, sp, 96
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v14, (a1), zero
+; RV32V-NEXT:    vlse64.v v16, (a2), zero
+; RV32V-NEXT:    vlse64.v v18, (a3), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 88
+; RV32V-NEXT:    addi a1, sp, 80
+; RV32V-NEXT:    addi a2, sp, 72
+; RV32V-NEXT:    addi a3, sp, 64
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v14, (a1), zero
+; RV32V-NEXT:    vlse64.v v16, (a2), zero
+; RV32V-NEXT:    vlse64.v v18, (a3), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 56
+; RV32V-NEXT:    addi a1, sp, 48
+; RV32V-NEXT:    addi a2, sp, 40
+; RV32V-NEXT:    addi a3, sp, 32
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v14, (a1), zero
+; RV32V-NEXT:    vlse64.v v16, (a2), zero
+; RV32V-NEXT:    vlse64.v v18, (a3), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vx v12, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v18, v8, s5
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vand.vx v20, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v22, v8, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vand.vx v26, v8, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vand.vx v28, v8, a0
+; RV32V-NEXT:    vand.vx v30, v8, s1
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vand.vx v6, v8, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vand.vx v4, v8, a0
+; RV32V-NEXT:    vand.vx v2, v8, s4
+; RV32V-NEXT:    vand.vx v0, v8, t2
+; RV32V-NEXT:    vand.vx v12, v8, s0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, t0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, t5
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s2
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, t6
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, a7
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, t4
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, a5
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, a6
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, a4
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, t1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s9
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, ra
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s7
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s3
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vand.vi v12, v8, 2
+; RV32V-NEXT:    vand.vi v14, v8, 1
+; RV32V-NEXT:    vand.vi v16, v8, 4
+; RV32V-NEXT:    vand.vi v8, v8, 8
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    vmul.vv v12, v10, v14
+; RV32V-NEXT:    vmul.vv v14, v10, v16
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v16, v10, v8
+; RV32V-NEXT:    vmul.vv v18, v10, v18
+; RV32V-NEXT:    vmul.vv v20, v10, v20
+; RV32V-NEXT:    vmul.vv v22, v10, v22
+; RV32V-NEXT:    vmul.vv v24, v10, v24
+; RV32V-NEXT:    vmul.vv v26, v10, v26
+; RV32V-NEXT:    vmul.vv v28, v10, v28
+; RV32V-NEXT:    vmul.vv v30, v10, v30
+; RV32V-NEXT:    vmul.vv v6, v10, v6
+; RV32V-NEXT:    vmul.vv v4, v10, v4
+; RV32V-NEXT:    vmul.vv v2, v10, v2
+; RV32V-NEXT:    vmul.vv v0, v10, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    addi a0, sp, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v10, v10, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v12, v8
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v12
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    vxor.vv v8, v8, v18
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v22
+; RV32V-NEXT:    vxor.vv v12, v8, v24
+; RV32V-NEXT:    vxor.vv v12, v12, v26
+; RV32V-NEXT:    vxor.vv v12, v12, v28
+; RV32V-NEXT:    vxor.vv v12, v12, v30
+; RV32V-NEXT:    vxor.vv v12, v12, v6
+; RV32V-NEXT:    vxor.vv v12, v12, v4
+; RV32V-NEXT:    vxor.vv v12, v12, v2
+; RV32V-NEXT:    vxor.vv v12, v12, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v12, v14
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    addi a0, sp, 304
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v14, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v16, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v14, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v18, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v18, v16, v18
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v18, v18, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v18, v18, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v18, v18, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v18, v18, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v18, v18, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v18, v18, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v18, v18, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v18, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v22, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v22
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v22, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v22
+; RV32V-NEXT:    vand.vx v22, v14, s11
+; RV32V-NEXT:    vsrl.vi v16, v16, 24
+; RV32V-NEXT:    vand.vx v16, v16, s11
+; RV32V-NEXT:    vand.vx v12, v12, s10
+; RV32V-NEXT:    vsll.vx v12, v12, s8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v24, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v24
+; RV32V-NEXT:    vsrl.vx v24, v18, s8
+; RV32V-NEXT:    vand.vx v24, v24, s10
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v26, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v26
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v26, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v26
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v26, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v26
+; RV32V-NEXT:    vxor.vv v10, v20, v10
+; RV32V-NEXT:    vsll.vx v8, v8, t3
+; RV32V-NEXT:    vsrl.vx v10, v10, t3
+; RV32V-NEXT:    vor.vv v8, v8, v12
+; RV32V-NEXT:    vsrl.vi v12, v14, 8
+; RV32V-NEXT:    vsll.vi v14, v22, 24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v20, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vand.vv v12, v12, v20
+; RV32V-NEXT:    vor.vv v12, v12, v16
+; RV32V-NEXT:    vand.vv v16, v18, v20
+; RV32V-NEXT:    vsll.vi v16, v16, 8
+; RV32V-NEXT:    vor.vv v14, v14, v16
+; RV32V-NEXT:    vor.vv v8, v8, v14
+; RV32V-NEXT:    vor.vv v10, v24, v10
+; RV32V-NEXT:    vor.vv v10, v12, v10
+; RV32V-NEXT:    vor.vv v8, v8, v10
+; RV32V-NEXT:    vsrl.vi v10, v8, 4
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v12
+; RV32V-NEXT:    vand.vv v10, v10, v12
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v10, v8
+; RV32V-NEXT:    vsrl.vi v10, v8, 2
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v12
+; RV32V-NEXT:    vand.vv v10, v10, v12
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v10, v8
+; RV32V-NEXT:    vsrl.vi v10, v8, 1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v12
+; RV32V-NEXT:    vand.vv v10, v10, v12
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v10, v8
+; RV32V-NEXT:    vsrl.vi v8, v8, 1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add sp, sp, a0
+; RV32V-NEXT:    lw ra, 364(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s0, 360(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s1, 356(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s2, 352(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s3, 348(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s4, 344(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s5, 340(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s6, 336(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s7, 332(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s8, 328(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s9, 324(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s10, 320(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s11, 316(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    addi sp, sp, 368
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv2i64_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    li a1, 56
+; RV64V-NEXT:    lui t2, 16
+; RV64V-NEXT:    lui a2, 4080
+; RV64V-NEXT:    li t0, 255
+; RV64V-NEXT:    lui a3, 61681
+; RV64V-NEXT:    lui a4, 209715
+; RV64V-NEXT:    lui a5, 349525
+; RV64V-NEXT:    srli a6, a0, 24
+; RV64V-NEXT:    srli a7, a0, 8
+; RV64V-NEXT:    srli t1, a0, 40
+; RV64V-NEXT:    srli t3, a0, 56
+; RV64V-NEXT:    addi a3, a3, -241
+; RV64V-NEXT:    addi a4, a4, 819
+; RV64V-NEXT:    addi t4, a5, 1365
+; RV64V-NEXT:    slli a5, a3, 32
+; RV64V-NEXT:    add a5, a3, a5
+; RV64V-NEXT:    slli a3, a4, 32
 ; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 3
-; RV64V-NEXT:    add a3, a3, a4
-; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 304
-; RV64V-NEXT:    vl8r.v v0, (a3) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vsrl.vx v8, v8, a2
-; RV64V-NEXT:    vand.vx v8, v8, a1
-; RV64V-NEXT:    vsrl.vx v24, v24, a0
-; RV64V-NEXT:    vor.vv v8, v8, v24
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 304
-; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vor.vv v8, v24, v8
-; RV64V-NEXT:    vor.vv v8, v16, v8
-; RV64V-NEXT:    vsrl.vi v16, v8, 4
-; RV64V-NEXT:    ld a0, 256(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v8, a0
-; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    slli a3, t4, 32
+; RV64V-NEXT:    add a3, t4, a3
+; RV64V-NEXT:    srliw t4, a0, 24
+; RV64V-NEXT:    slli t0, t0, 24
+; RV64V-NEXT:    and a6, a6, a2
+; RV64V-NEXT:    and a7, a7, t0
+; RV64V-NEXT:    or t5, a7, a6
+; RV64V-NEXT:    addi a6, t2, -256
+; RV64V-NEXT:    and a7, t1, a6
+; RV64V-NEXT:    or t1, a7, t3
+; RV64V-NEXT:    and a7, a0, a2
+; RV64V-NEXT:    slli t4, t4, 32
+; RV64V-NEXT:    slli a7, a7, 24
+; RV64V-NEXT:    or t3, a7, t4
+; RV64V-NEXT:    li a7, 40
+; RV64V-NEXT:    vsetvli t4, zero, e64, m2, ta, ma
+; RV64V-NEXT:    vsrl.vi v12, v8, 24
+; RV64V-NEXT:    vsrl.vi v10, v8, 8
+; RV64V-NEXT:    or t1, t5, t1
+; RV64V-NEXT:    slli t4, a0, 56
+; RV64V-NEXT:    and a0, a0, a6
+; RV64V-NEXT:    slli a0, a0, 40
+; RV64V-NEXT:    or t4, t4, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    or t4, t4, t3
+; RV64V-NEXT:    lui t3, 1
+; RV64V-NEXT:    vsrl.vx v14, v8, a1
+; RV64V-NEXT:    vsrl.vx v16, v8, a7
+; RV64V-NEXT:    vand.vx v12, v12, a2
+; RV64V-NEXT:    vand.vx v18, v8, a2
+; RV64V-NEXT:    vsll.vx v20, v8, a1
+; RV64V-NEXT:    vand.vx v16, v16, a6
+; RV64V-NEXT:    vand.vx v10, v10, t0
+; RV64V-NEXT:    vsll.vi v18, v18, 24
+; RV64V-NEXT:    vor.vv v14, v16, v14
+; RV64V-NEXT:    vand.vx v16, v8, t0
+; RV64V-NEXT:    vand.vx v8, v8, a6
+; RV64V-NEXT:    vor.vv v10, v10, v12
+; RV64V-NEXT:    vsll.vi v12, v16, 8
+; RV64V-NEXT:    vsll.vx v8, v8, a7
+; RV64V-NEXT:    vor.vv v10, v10, v14
+; RV64V-NEXT:    vor.vv v12, v18, v12
+; RV64V-NEXT:    vor.vv v8, v20, v8
+; RV64V-NEXT:    vor.vv v8, v8, v12
+; RV64V-NEXT:    vor.vv v8, v8, v10
+; RV64V-NEXT:    or t1, t4, t1
+; RV64V-NEXT:    vsrl.vi v10, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, a5
+; RV64V-NEXT:    srli t4, t1, 4
+; RV64V-NEXT:    and t1, t1, a5
+; RV64V-NEXT:    vand.vx v10, v10, a5
 ; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    and t4, t4, a5
+; RV64V-NEXT:    slli t1, t1, 4
+; RV64V-NEXT:    vor.vv v8, v10, v8
+; RV64V-NEXT:    or t1, t4, t1
+; RV64V-NEXT:    vsrl.vi v10, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, a4
+; RV64V-NEXT:    srli t4, t1, 2
+; RV64V-NEXT:    and t1, t1, a4
+; RV64V-NEXT:    vand.vx v10, v10, a4
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    and t4, t4, a4
+; RV64V-NEXT:    slli t1, t1, 2
+; RV64V-NEXT:    vor.vv v8, v10, v8
+; RV64V-NEXT:    or t1, t4, t1
+; RV64V-NEXT:    vsrl.vi v10, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, a3
+; RV64V-NEXT:    srli t4, t1, 1
+; RV64V-NEXT:    and t1, t1, a3
+; RV64V-NEXT:    vand.vx v10, v10, a3
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    and t4, t4, a3
+; RV64V-NEXT:    slli t1, t1, 1
+; RV64V-NEXT:    vor.vv v8, v10, v8
+; RV64V-NEXT:    or t1, t4, t1
+; RV64V-NEXT:    andi t4, t1, 2
+; RV64V-NEXT:    vmul.vx v10, v8, t4
+; RV64V-NEXT:    andi t4, t1, 1
+; RV64V-NEXT:    vmul.vx v12, v8, t4
+; RV64V-NEXT:    andi t4, t1, 4
+; RV64V-NEXT:    vmul.vx v14, v8, t4
+; RV64V-NEXT:    andi t4, t1, 8
+; RV64V-NEXT:    vmul.vx v16, v8, t4
+; RV64V-NEXT:    andi t4, t1, 16
+; RV64V-NEXT:    vmul.vx v18, v8, t4
+; RV64V-NEXT:    andi t4, t1, 32
+; RV64V-NEXT:    vmul.vx v20, v8, t4
+; RV64V-NEXT:    andi t4, t1, 64
+; RV64V-NEXT:    vxor.vv v10, v12, v10
+; RV64V-NEXT:    vmul.vx v12, v8, t4
+; RV64V-NEXT:    andi t4, t1, 128
+; RV64V-NEXT:    vxor.vv v10, v10, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t4
+; RV64V-NEXT:    andi t4, t1, 256
+; RV64V-NEXT:    vxor.vv v10, v10, v16
+; RV64V-NEXT:    vmul.vx v16, v8, t4
+; RV64V-NEXT:    andi t4, t1, 512
+; RV64V-NEXT:    vxor.vv v10, v10, v18
+; RV64V-NEXT:    vmul.vx v18, v8, t4
+; RV64V-NEXT:    andi t4, t1, 1024
+; RV64V-NEXT:    vxor.vv v10, v10, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t4
+; RV64V-NEXT:    slli t4, a0, 11
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v10, v10, v12
+; RV64V-NEXT:    vmul.vx v12, v8, t4
+; RV64V-NEXT:    lui t4, 2
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v10, v10, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t3
+; RV64V-NEXT:    lui t3, 4
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v16, v10, v16
+; RV64V-NEXT:    vxor.vv v16, v16, v18
+; RV64V-NEXT:    vmul.vx v18, v8, t4
+; RV64V-NEXT:    lui t4, 8
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t3
+; RV64V-NEXT:    lui t3, 32
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v12, v16, v12
+; RV64V-NEXT:    vmul.vx v16, v8, t4
+; RV64V-NEXT:    lui t4, 64
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t2
+; RV64V-NEXT:    lui t2, 128
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v12, v12, v18
+; RV64V-NEXT:    vmul.vx v18, v8, t3
+; RV64V-NEXT:    lui t3, 256
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v12, v12, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t4
+; RV64V-NEXT:    lui t4, 512
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v12, v12, v16
+; RV64V-NEXT:    vmul.vx v16, v8, t2
+; RV64V-NEXT:    lui t2, 1024
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t3
+; RV64V-NEXT:    lui t3, 2048
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v12, v12, v18
+; RV64V-NEXT:    vmul.vx v18, v8, t4
+; RV64V-NEXT:    lui t4, 4096
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v12, v12, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t2
+; RV64V-NEXT:    lui t2, 8192
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v12, v12, v16
+; RV64V-NEXT:    vmul.vx v16, v8, t3
+; RV64V-NEXT:    lui t3, 16384
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vx v22, v8, t4
+; RV64V-NEXT:    lui t4, 32768
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v12, v12, v18
+; RV64V-NEXT:    vmul.vx v18, v8, t2
+; RV64V-NEXT:    lui t2, 65536
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v14, v12, v20
+; RV64V-NEXT:    vmul.vx v12, v8, t3
+; RV64V-NEXT:    lui t3, 131072
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v20, v14, v16
+; RV64V-NEXT:    vmul.vx v14, v8, t4
+; RV64V-NEXT:    lui t4, 262144
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v16, v20, v22
+; RV64V-NEXT:    vxor.vv v18, v16, v18
+; RV64V-NEXT:    vmul.vx v16, v8, t2
+; RV64V-NEXT:    slli t2, a0, 32
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vsll.vx v10, v10, a1
+; RV64V-NEXT:    vand.vx v20, v20, a6
+; RV64V-NEXT:    vsll.vx v20, v20, a7
+; RV64V-NEXT:    vor.vv v10, v10, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t3
+; RV64V-NEXT:    slli t3, a0, 33
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v12, v18, v12
+; RV64V-NEXT:    vmul.vx v18, v8, t4
+; RV64V-NEXT:    slli t4, a0, 34
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t2
+; RV64V-NEXT:    slli t2, a0, 35
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v12, v12, v16
+; RV64V-NEXT:    vmul.vx v16, v8, t3
+; RV64V-NEXT:    slli t3, a0, 36
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v20, v12, v20
+; RV64V-NEXT:    vmul.vx v12, v8, t4
+; RV64V-NEXT:    srliw t4, t1, 31
+; RV64V-NEXT:    slli t4, t4, 31
+; RV64V-NEXT:    vxor.vv v18, v20, v18
+; RV64V-NEXT:    vmul.vx v20, v8, t4
+; RV64V-NEXT:    slli t4, a0, 37
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v18, v18, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t2
+; RV64V-NEXT:    slli t2, a0, 38
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v14, v18, v14
+; RV64V-NEXT:    vmul.vx v18, v8, t3
+; RV64V-NEXT:    slli t3, a0, 39
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v14, v14, v16
+; RV64V-NEXT:    vmul.vx v16, v8, t4
+; RV64V-NEXT:    slli t4, a0, 40
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v12, v14, v12
+; RV64V-NEXT:    vmul.vx v14, v8, t2
+; RV64V-NEXT:    slli t2, a0, 41
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v12, v12, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t3
+; RV64V-NEXT:    slli t3, a0, 42
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v12, v12, v18
+; RV64V-NEXT:    vmul.vx v18, v8, t4
+; RV64V-NEXT:    slli t4, a0, 43
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v12, v12, v16
+; RV64V-NEXT:    vmul.vx v16, v8, t2
+; RV64V-NEXT:    slli t2, a0, 44
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vx v14, v8, t3
+; RV64V-NEXT:    slli t3, a0, 45
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v12, v12, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t4
+; RV64V-NEXT:    slli t4, a0, 46
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v18, v12, v18
+; RV64V-NEXT:    vxor.vv v16, v18, v16
+; RV64V-NEXT:    vmul.vx v18, v8, t2
+; RV64V-NEXT:    slli t2, a0, 47
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v14, v16, v14
+; RV64V-NEXT:    vmul.vx v16, v8, t3
+; RV64V-NEXT:    slli t3, a0, 48
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v14, v14, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t4
+; RV64V-NEXT:    slli t4, a0, 49
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v14, v14, v18
+; RV64V-NEXT:    vmul.vx v18, v8, t2
+; RV64V-NEXT:    slli t2, a0, 50
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v14, v14, v16
+; RV64V-NEXT:    vmul.vx v16, v8, t3
+; RV64V-NEXT:    slli t3, a0, 51
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v14, v14, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t4
+; RV64V-NEXT:    slli t4, a0, 52
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v14, v14, v18
+; RV64V-NEXT:    vmul.vx v18, v8, t2
+; RV64V-NEXT:    slli t2, a0, 53
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v16, v14, v16
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t3
+; RV64V-NEXT:    slli t3, a0, 54
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v16, v16, v18
+; RV64V-NEXT:    vsrl.vi v18, v12, 8
+; RV64V-NEXT:    vand.vx v18, v18, t0
+; RV64V-NEXT:    vsrl.vi v14, v14, 24
+; RV64V-NEXT:    vand.vx v14, v14, a2
+; RV64V-NEXT:    vor.vv v14, v18, v14
+; RV64V-NEXT:    vmul.vx v18, v8, t4
+; RV64V-NEXT:    slli t4, a0, 55
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t2
+; RV64V-NEXT:    slli t2, a0, 56
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v16, v16, v18
+; RV64V-NEXT:    vmul.vx v18, v8, t3
+; RV64V-NEXT:    slli t3, a0, 57
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t4
+; RV64V-NEXT:    slli t4, a0, 58
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v16, v16, v18
+; RV64V-NEXT:    vmul.vx v18, v8, t2
+; RV64V-NEXT:    slli t2, a0, 59
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t3
+; RV64V-NEXT:    slli t3, a0, 60
+; RV64V-NEXT:    vand.vx v12, v12, a2
+; RV64V-NEXT:    slli a2, a0, 61
+; RV64V-NEXT:    slli a0, a0, 62
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    and a2, t1, a2
+; RV64V-NEXT:    and a0, t1, a0
+; RV64V-NEXT:    srli t1, t1, 63
+; RV64V-NEXT:    vsll.vi v12, v12, 24
+; RV64V-NEXT:    vxor.vv v18, v16, v18
+; RV64V-NEXT:    vxor.vv v18, v18, v20
+; RV64V-NEXT:    vand.vx v20, v16, t0
+; RV64V-NEXT:    vsll.vi v20, v20, 8
+; RV64V-NEXT:    vor.vv v12, v12, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t4
+; RV64V-NEXT:    vxor.vv v18, v18, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t2
+; RV64V-NEXT:    vor.vv v10, v10, v12
+; RV64V-NEXT:    vmul.vx v12, v8, t3
+; RV64V-NEXT:    vxor.vv v18, v18, v20
+; RV64V-NEXT:    vmul.vx v20, v8, a2
+; RV64V-NEXT:    vxor.vv v12, v18, v12
+; RV64V-NEXT:    vmul.vx v18, v8, a0
+; RV64V-NEXT:    slli t1, t1, 63
+; RV64V-NEXT:    vmul.vx v8, v8, t1
+; RV64V-NEXT:    vsrl.vx v16, v16, a7
+; RV64V-NEXT:    vand.vx v16, v16, a6
+; RV64V-NEXT:    vxor.vv v12, v12, v20
+; RV64V-NEXT:    vxor.vv v12, v12, v18
+; RV64V-NEXT:    vxor.vv v8, v12, v8
+; RV64V-NEXT:    vsrl.vx v8, v8, a1
 ; RV64V-NEXT:    vor.vv v8, v16, v8
-; RV64V-NEXT:    vsrl.vi v16, v8, 2
-; RV64V-NEXT:    ld a0, 264(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v8, a0
-; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vor.vv v8, v14, v8
+; RV64V-NEXT:    vor.vv v8, v10, v8
+; RV64V-NEXT:    vsrl.vi v10, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, a5
+; RV64V-NEXT:    vand.vx v10, v10, a5
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v10, v8
+; RV64V-NEXT:    vsrl.vi v10, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, a4
+; RV64V-NEXT:    vand.vx v10, v10, a4
 ; RV64V-NEXT:    vsll.vi v8, v8, 2
-; RV64V-NEXT:    vor.vv v8, v16, v8
-; RV64V-NEXT:    vsrl.vi v16, v8, 1
-; RV64V-NEXT:    ld a0, 272(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v8, a0
-; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vor.vv v8, v10, v8
+; RV64V-NEXT:    vsrl.vi v10, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, a3
+; RV64V-NEXT:    vand.vx v10, v10, a3
 ; RV64V-NEXT:    vadd.vv v8, v8, v8
-; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vor.vv v8, v10, v8
 ; RV64V-NEXT:    vsrl.vi v8, v8, 1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add sp, sp, a0
-; RV64V-NEXT:    ld ra, 408(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s0, 400(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s1, 392(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s2, 384(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s3, 376(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s4, 368(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s5, 360(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s6, 352(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s7, 344(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s8, 336(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s9, 328(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s10, 320(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s11, 312(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    addi sp, sp, 416
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv8i64_vv:
+; RV32ZVBC-LABEL: clmulh_nxv2i64_vx:
 ; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v16
+; RV32ZVBC-NEXT:    addi sp, sp, -16
+; RV32ZVBC-NEXT:    sw a0, 8(sp)
+; RV32ZVBC-NEXT:    sw a1, 12(sp)
+; RV32ZVBC-NEXT:    addi a0, sp, 8
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32ZVBC-NEXT:    vlse64.v v10, (a0), zero
+; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v10
+; RV32ZVBC-NEXT:    addi sp, sp, 16
 ; RV32ZVBC-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv8i64_vv:
+; RV64ZVBC-LABEL: clmulh_nxv2i64_vx:
 ; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64ZVBC-NEXT:    vclmulh.vv v8, v8, v16
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV64ZVBC-NEXT:    vclmulh.vx v8, v8, a0
 ; RV64ZVBC-NEXT:    ret
-  %va.ext = zext <vscale x 8 x i64> %va to <vscale x 8 x i128>
-  %vb.ext = zext <vscale x 8 x i64> %vb to <vscale x 8 x i128>
-  %clmul = call <vscale x 8 x i128> @llvm.clmul.nxv8i128(<vscale x 8 x i128> %va.ext, <vscale x 8 x i128> %vb.ext)
-  %res.ext = lshr <vscale x 8 x i128> %clmul, splat(i128 64)
-  %res = trunc <vscale x 8 x i128> %res.ext to <vscale x 8 x i64>
-  ret <vscale x 8 x i64> %res
+  %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i128 0
+  %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %va.ext = zext <vscale x 2 x i64> %va to <vscale x 2 x i128>
+  %vb.ext = zext <vscale x 2 x i64> %vb to <vscale x 2 x i128>
+  %clmul = call <vscale x 2 x i128> @llvm.clmul.nxv2i128(<vscale x 2 x i128> %va.ext, <vscale x 2 x i128> %vb.ext)
+  %res.ext = lshr <vscale x 2 x i128> %clmul, splat(i128 64)
+  %res = trunc <vscale x 2 x i128> %res.ext to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
 }
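The IR above captures the contract these checks are exercising: each operand is zero-extended to i128, carry-lessly multiplied, shifted right by 64, and truncated back to i64, i.e. the high half of the 128-bit carry-less product, which is what vclmulh.vv/vclmulh.vx compute per element when Zvbc is available. A minimal standalone C++ sketch of that identity (illustrative only, not part of the patch; the helper name is made up):

    #include <cstdint>
    #include <cstdio>

    // Carry-less (XOR-accumulate) multiply of two 64-bit values into a
    // 128-bit (hi, lo) pair; hi matches clmulh, lo matches plain clmul.
    static void clmul128(uint64_t a, uint64_t b, uint64_t &hi, uint64_t &lo) {
      hi = lo = 0;
      for (int i = 0; i < 64; ++i)
        if ((b >> i) & 1) {
          lo ^= a << i;
          if (i)
            hi ^= a >> (64 - i); // bits of a that shift past bit 63
        }
    }

    int main() {
      uint64_t hi, lo;
      clmul128(0x8000000000000001ULL, 0x3ULL, hi, lo);
      printf("clmulh=%016llx clmul=%016llx\n",
             (unsigned long long)hi, (unsigned long long)lo);
      return 0; // prints clmulh=0000000000000001 clmul=8000000000000003
    }

Without Zvbc (the RV32V/RV64V check prefixes), the legalizer has no carry-less multiply to call, so the long sequences above synthesize the same result from ordinary vmul/vxor plus the bit-reversal constants (0x0f0f..., 0x3333..., 0x5555...), which is why those functions spill so heavily.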
 
-define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nounwind {
-; RV32V-LABEL: clmulh_nxv8i64_vx:
+define <vscale x 4 x i64> @clmulh_nxv4i64_vv(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb) nounwind {
+; RV32V-LABEL: clmulh_nxv4i64_vv:
 ; RV32V:       # %bb.0:
-; RV32V-NEXT:    addi sp, sp, -368
-; RV32V-NEXT:    sw ra, 364(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s0, 360(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s1, 356(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s2, 352(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s3, 348(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s4, 344(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s5, 340(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s6, 336(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s7, 332(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s8, 328(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s9, 324(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s10, 320(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    sw s11, 316(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a2, a2, 3
-; RV32V-NEXT:    mv a3, a2
-; RV32V-NEXT:    slli a2, a2, 1
-; RV32V-NEXT:    add a3, a3, a2
-; RV32V-NEXT:    slli a2, a2, 5
-; RV32V-NEXT:    add a2, a2, a3
-; RV32V-NEXT:    sub sp, sp, a2
-; RV32V-NEXT:    csrr a2, vlenb
-; RV32V-NEXT:    slli a2, a2, 9
-; RV32V-NEXT:    add a2, sp, a2
-; RV32V-NEXT:    addi a2, a2, 304
-; RV32V-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    sw a0, 16(sp)
-; RV32V-NEXT:    sw a1, 20(sp)
-; RV32V-NEXT:    addi a2, sp, 16
-; RV32V-NEXT:    lui t6, 16
-; RV32V-NEXT:    li t5, 56
-; RV32V-NEXT:    li t4, 40
-; RV32V-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
-; RV32V-NEXT:    vlse64.v v24, (a2), zero
-; RV32V-NEXT:    vsrl.vx v16, v8, t5
-; RV32V-NEXT:    vsrl.vx v0, v8, t4
-; RV32V-NEXT:    addi t3, t6, -256
-; RV32V-NEXT:    vand.vx v0, v0, t3
-; RV32V-NEXT:    vor.vv v16, v0, v16
+; RV32V-NEXT:    addi sp, sp, -352
+; RV32V-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s2, 336(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s3, 332(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s4, 328(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s5, 324(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s6, 320(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s7, 316(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s8, 312(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    sub sp, sp, a0
+; RV32V-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32V-NEXT:    vmv4r.v v28, v12
+; RV32V-NEXT:    lui s11, 1044480
+; RV32V-NEXT:    lui t6, 524288
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    li ra, 2
+; RV32V-NEXT:    li t4, 4
+; RV32V-NEXT:    li t2, 8
+; RV32V-NEXT:    li t5, 16
+; RV32V-NEXT:    li t3, 32
+; RV32V-NEXT:    li t1, 64
+; RV32V-NEXT:    li t0, 128
+; RV32V-NEXT:    li a7, 256
+; RV32V-NEXT:    li a6, 512
+; RV32V-NEXT:    li a3, 1024
+; RV32V-NEXT:    lui a2, 1
+; RV32V-NEXT:    lui a4, 2
+; RV32V-NEXT:    lui a1, 4
+; RV32V-NEXT:    lui a5, 8
+; RV32V-NEXT:    lui s0, 16
+; RV32V-NEXT:    lui s1, 32
+; RV32V-NEXT:    lui s2, 64
+; RV32V-NEXT:    lui s3, 128
+; RV32V-NEXT:    lui s4, 256
+; RV32V-NEXT:    lui s5, 512
+; RV32V-NEXT:    lui s6, 1024
+; RV32V-NEXT:    lui s7, 2048
+; RV32V-NEXT:    lui s8, 4096
+; RV32V-NEXT:    lui s9, 8192
+; RV32V-NEXT:    lui s10, 16384
+; RV32V-NEXT:    sw s11, 248(sp)
+; RV32V-NEXT:    lui s11, 32768
+; RV32V-NEXT:    sw zero, 252(sp)
+; RV32V-NEXT:    sw t6, 8(sp)
+; RV32V-NEXT:    sw zero, 12(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw a0, 276(sp)
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw ra, 260(sp)
+; RV32V-NEXT:    lui ra, 65536
+; RV32V-NEXT:    sw zero, 264(sp)
+; RV32V-NEXT:    sw t4, 268(sp)
+; RV32V-NEXT:    lui t4, 131072
+; RV32V-NEXT:    sw zero, 240(sp)
+; RV32V-NEXT:    sw t2, 244(sp)
+; RV32V-NEXT:    lui t2, 262144
+; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    sw t5, 236(sp)
+; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw t3, 228(sp)
+; RV32V-NEXT:    sw zero, 216(sp)
+; RV32V-NEXT:    sw t1, 220(sp)
+; RV32V-NEXT:    sw zero, 208(sp)
+; RV32V-NEXT:    sw t0, 212(sp)
+; RV32V-NEXT:    sw zero, 200(sp)
+; RV32V-NEXT:    sw a7, 204(sp)
+; RV32V-NEXT:    sw zero, 192(sp)
+; RV32V-NEXT:    sw a6, 196(sp)
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw a3, 188(sp)
+; RV32V-NEXT:    slli a3, a0, 11
+; RV32V-NEXT:    sw zero, 176(sp)
+; RV32V-NEXT:    sw a3, 180(sp)
+; RV32V-NEXT:    sw zero, 168(sp)
+; RV32V-NEXT:    sw a2, 172(sp)
+; RV32V-NEXT:    lui t1, 1
+; RV32V-NEXT:    sw zero, 160(sp)
+; RV32V-NEXT:    sw a4, 164(sp)
+; RV32V-NEXT:    lui t3, 2
+; RV32V-NEXT:    sw zero, 152(sp)
+; RV32V-NEXT:    sw a1, 156(sp)
+; RV32V-NEXT:    sw zero, 144(sp)
+; RV32V-NEXT:    sw a5, 148(sp)
+; RV32V-NEXT:    lui t5, 8
+; RV32V-NEXT:    sw zero, 136(sp)
+; RV32V-NEXT:    sw s0, 140(sp)
+; RV32V-NEXT:    sw zero, 128(sp)
+; RV32V-NEXT:    sw s1, 132(sp)
+; RV32V-NEXT:    sw zero, 120(sp)
+; RV32V-NEXT:    sw s2, 124(sp)
+; RV32V-NEXT:    sw zero, 112(sp)
+; RV32V-NEXT:    sw s3, 116(sp)
+; RV32V-NEXT:    sw zero, 104(sp)
+; RV32V-NEXT:    sw s4, 108(sp)
+; RV32V-NEXT:    sw zero, 96(sp)
+; RV32V-NEXT:    sw s5, 100(sp)
+; RV32V-NEXT:    sw zero, 88(sp)
+; RV32V-NEXT:    sw s6, 92(sp)
+; RV32V-NEXT:    sw zero, 80(sp)
+; RV32V-NEXT:    sw s7, 84(sp)
+; RV32V-NEXT:    sw zero, 72(sp)
+; RV32V-NEXT:    sw s8, 76(sp)
+; RV32V-NEXT:    sw zero, 64(sp)
+; RV32V-NEXT:    sw s9, 68(sp)
+; RV32V-NEXT:    sw zero, 56(sp)
+; RV32V-NEXT:    sw s10, 60(sp)
+; RV32V-NEXT:    sw zero, 48(sp)
+; RV32V-NEXT:    sw s11, 52(sp)
+; RV32V-NEXT:    sw zero, 40(sp)
+; RV32V-NEXT:    sw ra, 44(sp)
+; RV32V-NEXT:    sw zero, 32(sp)
+; RV32V-NEXT:    sw t4, 36(sp)
+; RV32V-NEXT:    sw zero, 24(sp)
+; RV32V-NEXT:    sw t2, 28(sp)
+; RV32V-NEXT:    sw zero, 16(sp)
+; RV32V-NEXT:    sw t6, 20(sp)
+; RV32V-NEXT:    addi a1, sp, 248
+; RV32V-NEXT:    vlse64.v v12, (a1), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    li a6, 56
+; RV32V-NEXT:    vsrl.vx v20, v8, a6
+; RV32V-NEXT:    li a5, 40
+; RV32V-NEXT:    vsrl.vx v24, v8, a5
+; RV32V-NEXT:    vsll.vx v16, v8, a6
+; RV32V-NEXT:    vsrl.vx v12, v28, a6
+; RV32V-NEXT:    vsrl.vx v4, v28, a5
+; RV32V-NEXT:    addi a2, s0, -256
+; RV32V-NEXT:    vand.vx v24, v24, a2
+; RV32V-NEXT:    vor.vv v24, v24, v20
+; RV32V-NEXT:    vsll.vx v0, v28, a6
+; RV32V-NEXT:    vand.vx v20, v4, a2
+; RV32V-NEXT:    vor.vv v12, v20, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v20, v8, a2
+; RV32V-NEXT:    vsll.vx v20, v20, a5
+; RV32V-NEXT:    vor.vv v20, v16, v20
+; RV32V-NEXT:    vand.vx v16, v28, a2
+; RV32V-NEXT:    vsll.vx v16, v16, a5
+; RV32V-NEXT:    vor.vv v12, v0, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vsrl.vi v4, v8, 24
+; RV32V-NEXT:    lui a4, 4080
+; RV32V-NEXT:    vand.vx v4, v4, a4
+; RV32V-NEXT:    vsrl.vi v0, v8, 8
+; RV32V-NEXT:    vmv4r.v v12, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vand.vv v0, v0, v8
+; RV32V-NEXT:    vor.vv v4, v0, v4
+; RV32V-NEXT:    vsrl.vi v0, v28, 24
+; RV32V-NEXT:    vand.vx v0, v0, a4
+; RV32V-NEXT:    vsrl.vi v16, v28, 8
+; RV32V-NEXT:    vand.vv v16, v16, v8
+; RV32V-NEXT:    vor.vv v16, v16, v0
+; RV32V-NEXT:    vor.vv v24, v4, v24
+; RV32V-NEXT:    vand.vx v4, v12, a4
+; RV32V-NEXT:    vsll.vi v4, v4, 24
+; RV32V-NEXT:    vand.vv v12, v12, v8
+; RV32V-NEXT:    vsll.vi v12, v12, 8
+; RV32V-NEXT:    vor.vv v12, v4, v12
+; RV32V-NEXT:    lui a7, 61681
+; RV32V-NEXT:    addi a7, a7, -241
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v4, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vor.vv v16, v16, v4
+; RV32V-NEXT:    vmv4r.v v4, v28
+; RV32V-NEXT:    vand.vx v28, v28, a4
+; RV32V-NEXT:    vsll.vi v28, v28, 24
+; RV32V-NEXT:    vand.vv v4, v4, v8
+; RV32V-NEXT:    vsll.vi v4, v4, 8
+; RV32V-NEXT:    vor.vv v28, v28, v4
+; RV32V-NEXT:    vsetvli t0, zero, e32, m4, ta, ma
+; RV32V-NEXT:    vmv.v.x v4, a7
+; RV32V-NEXT:    lui a7, 209715
+; RV32V-NEXT:    addi a7, a7, 819
+; RV32V-NEXT:    vsetvli t0, zero, e64, m4, ta, ma
+; RV32V-NEXT:    vor.vv v12, v20, v12
+; RV32V-NEXT:    vsetvli t0, zero, e32, m4, ta, ma
+; RV32V-NEXT:    vmv.v.x v0, a7
+; RV32V-NEXT:    lui a7, 349525
+; RV32V-NEXT:    addi a7, a7, 1365
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vsetvli t0, zero, e64, m4, ta, ma
+; RV32V-NEXT:    vor.vv v28, v8, v28
+; RV32V-NEXT:    vsetvli t0, zero, e32, m4, ta, ma
+; RV32V-NEXT:    vmv.v.x v8, a7
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vsetvli a7, zero, e64, m4, ta, ma
+; RV32V-NEXT:    vor.vv v24, v12, v24
+; RV32V-NEXT:    addi a7, sp, 8
+; RV32V-NEXT:    vlse64.v v12, (a7), zero
+; RV32V-NEXT:    vor.vv v8, v28, v16
+; RV32V-NEXT:    vsrl.vi v16, v24, 4
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v20, v24, v4
+; RV32V-NEXT:    vand.vv v16, v16, v4
+; RV32V-NEXT:    vsll.vi v20, v20, 4
+; RV32V-NEXT:    vor.vv v16, v16, v20
+; RV32V-NEXT:    vsrl.vi v20, v8, 4
+; RV32V-NEXT:    vand.vv v8, v8, v4
+; RV32V-NEXT:    vand.vv v20, v20, v4
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v20, v8
+; RV32V-NEXT:    vsrl.vi v20, v16, 2
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v0, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v16, v16, v0
+; RV32V-NEXT:    vand.vv v20, v20, v0
+; RV32V-NEXT:    vsll.vi v16, v16, 2
+; RV32V-NEXT:    vor.vv v16, v20, v16
+; RV32V-NEXT:    vsrl.vi v20, v8, 2
+; RV32V-NEXT:    vand.vv v8, v8, v0
+; RV32V-NEXT:    vand.vv v20, v20, v0
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v20, v8
+; RV32V-NEXT:    vsrl.vi v20, v16, 1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vand.vv v16, v16, v28
+; RV32V-NEXT:    vand.vv v20, v20, v28
+; RV32V-NEXT:    vadd.vv v16, v16, v16
+; RV32V-NEXT:    vor.vv v24, v20, v16
+; RV32V-NEXT:    vsrl.vi v16, v8, 1
+; RV32V-NEXT:    vand.vv v8, v8, v28
+; RV32V-NEXT:    vand.vv v16, v16, v28
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    addi a7, sp, 272
+; RV32V-NEXT:    vlse64.v v16, (a7), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    addi a7, sp, 256
+; RV32V-NEXT:    addi t0, sp, 264
+; RV32V-NEXT:    addi a0, sp, 240
+; RV32V-NEXT:    vlse64.v v12, (a7), zero
+; RV32V-NEXT:    vlse64.v v20, (t0), zero
+; RV32V-NEXT:    vlse64.v v28, (a0), zero
+; RV32V-NEXT:    vand.vv v16, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v28
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    addi a1, sp, 232
+; RV32V-NEXT:    addi a7, sp, 224
+; RV32V-NEXT:    addi t0, sp, 216
+; RV32V-NEXT:    addi a0, sp, 208
+; RV32V-NEXT:    vlse64.v v12, (a1), zero
+; RV32V-NEXT:    vlse64.v v16, (a7), zero
+; RV32V-NEXT:    vlse64.v v20, (t0), zero
+; RV32V-NEXT:    vlse64.v v28, (a0), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v28
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 200
+; RV32V-NEXT:    addi a1, sp, 192
+; RV32V-NEXT:    addi a7, sp, 184
+; RV32V-NEXT:    addi t0, sp, 176
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v16, (a1), zero
+; RV32V-NEXT:    vlse64.v v20, (a7), zero
+; RV32V-NEXT:    vlse64.v v28, (t0), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 8
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v28
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 168
+; RV32V-NEXT:    addi a1, sp, 160
+; RV32V-NEXT:    addi a7, sp, 152
+; RV32V-NEXT:    addi t0, sp, 144
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v16, (a1), zero
+; RV32V-NEXT:    vlse64.v v20, (a7), zero
+; RV32V-NEXT:    vlse64.v v28, (t0), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v28
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 136
+; RV32V-NEXT:    addi a1, sp, 128
+; RV32V-NEXT:    addi a7, sp, 120
+; RV32V-NEXT:    addi t0, sp, 112
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v16, (a1), zero
+; RV32V-NEXT:    vlse64.v v20, (a7), zero
+; RV32V-NEXT:    vlse64.v v28, (t0), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v28
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 104
+; RV32V-NEXT:    addi a1, sp, 96
+; RV32V-NEXT:    addi a7, sp, 88
+; RV32V-NEXT:    addi t0, sp, 80
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v16, (a1), zero
+; RV32V-NEXT:    vlse64.v v20, (a7), zero
+; RV32V-NEXT:    vlse64.v v28, (t0), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v28
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 72
+; RV32V-NEXT:    addi a1, sp, 64
+; RV32V-NEXT:    addi a7, sp, 56
+; RV32V-NEXT:    addi t0, sp, 48
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v16, (a1), zero
+; RV32V-NEXT:    vlse64.v v20, (a7), zero
+; RV32V-NEXT:    vlse64.v v28, (t0), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v28
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 40
+; RV32V-NEXT:    addi a1, sp, 32
+; RV32V-NEXT:    addi a7, sp, 24
+; RV32V-NEXT:    addi t0, sp, 16
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v16, (a1), zero
+; RV32V-NEXT:    vlse64.v v20, (a7), zero
+; RV32V-NEXT:    vlse64.v v28, (t0), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v20
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v28
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vx v12, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vand.vx v28, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vand.vx v4, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v0, v8, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vand.vx v12, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vand.vx v12, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vand.vx v12, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, a3
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, t1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, t3
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vand.vx v12, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, t5
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s2
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s3
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s4
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s5
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s6
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s7
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s9
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s10
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s11
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, ra
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, t4
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, t2
+; RV32V-NEXT:    addi a0, sp, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vi v12, v8, 2
+; RV32V-NEXT:    vand.vi v16, v8, 1
+; RV32V-NEXT:    vand.vi v20, v8, 4
+; RV32V-NEXT:    vand.vi v8, v8, 8
+; RV32V-NEXT:    vmul.vv v12, v24, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vmul.vv v12, v24, v16
+; RV32V-NEXT:    vmul.vv v16, v24, v20
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v20, v24, v8
+; RV32V-NEXT:    vmul.vv v28, v24, v28
+; RV32V-NEXT:    vmul.vv v4, v24, v4
+; RV32V-NEXT:    vmul.vv v0, v24, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v16, v8, t3
-; RV32V-NEXT:    vsll.vx v16, v16, t4
-; RV32V-NEXT:    vsll.vx v0, v8, t5
-; RV32V-NEXT:    vor.vv v8, v0, v16
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vsrl.vx v8, v24, t4
-; RV32V-NEXT:    vand.vx v8, v8, t3
-; RV32V-NEXT:    vsrl.vx v0, v24, t5
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 8
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vor.vv v8, v8, v0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v8, v24, t3
-; RV32V-NEXT:    vsll.vx v8, v8, t4
-; RV32V-NEXT:    vsll.vx v0, v24, t5
-; RV32V-NEXT:    vor.vv v8, v0, v8
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui a4, 1044480
-; RV32V-NEXT:    lui a7, 524288
-; RV32V-NEXT:    li ra, 1
-; RV32V-NEXT:    li a5, 2
-; RV32V-NEXT:    li a6, 4
-; RV32V-NEXT:    li s0, 8
-; RV32V-NEXT:    li s11, 16
-; RV32V-NEXT:    li s10, 32
-; RV32V-NEXT:    li s9, 64
-; RV32V-NEXT:    li s8, 128
-; RV32V-NEXT:    li s7, 256
-; RV32V-NEXT:    li s6, 512
-; RV32V-NEXT:    li s5, 1024
-; RV32V-NEXT:    lui s4, 1
-; RV32V-NEXT:    lui s3, 2
-; RV32V-NEXT:    lui s2, 4
-; RV32V-NEXT:    lui s1, 8
-; RV32V-NEXT:    lui a0, 32
-; RV32V-NEXT:    lui a1, 64
-; RV32V-NEXT:    lui a2, 128
-; RV32V-NEXT:    lui a3, 256
-; RV32V-NEXT:    lui t1, 512
-; RV32V-NEXT:    lui t0, 1024
-; RV32V-NEXT:    lui t2, 2048
-; RV32V-NEXT:    sw a4, 288(sp)
-; RV32V-NEXT:    lui a4, 4096
-; RV32V-NEXT:    sw zero, 292(sp)
-; RV32V-NEXT:    sw a7, 280(sp)
-; RV32V-NEXT:    sw zero, 284(sp)
-; RV32V-NEXT:    sw zero, 272(sp)
-; RV32V-NEXT:    sw ra, 276(sp)
-; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw a5, 268(sp)
-; RV32V-NEXT:    lui a5, 8192
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw a6, 260(sp)
-; RV32V-NEXT:    lui a6, 16384
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw s0, 252(sp)
-; RV32V-NEXT:    lui s0, 32768
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw s11, 244(sp)
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw s10, 236(sp)
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw s9, 228(sp)
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s8, 220(sp)
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw s7, 212(sp)
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s6, 204(sp)
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s5, 196(sp)
-; RV32V-NEXT:    slli ra, ra, 11
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw ra, 188(sp)
-; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw s4, 180(sp)
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw s3, 172(sp)
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw s2, 164(sp)
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw s1, 156(sp)
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw t6, 148(sp)
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw a0, 140(sp)
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw a1, 132(sp)
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw a2, 124(sp)
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw a3, 116(sp)
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t1, 108(sp)
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw t0, 100(sp)
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw t2, 92(sp)
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw a4, 84(sp)
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a5, 76(sp)
-; RV32V-NEXT:    lui a2, 8192
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw a6, 68(sp)
-; RV32V-NEXT:    lui t2, 16384
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw s0, 60(sp)
-; RV32V-NEXT:    lui a6, 65536
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw a6, 52(sp)
-; RV32V-NEXT:    lui a5, 131072
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw a5, 44(sp)
-; RV32V-NEXT:    lui a4, 262144
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw a4, 36(sp)
-; RV32V-NEXT:    sw a7, 28(sp)
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    lui a3, 4080
-; RV32V-NEXT:    addi t0, sp, 288
-; RV32V-NEXT:    vlse64.v v16, (t0), zero
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 9
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vsrl.vi v8, v24, 24
-; RV32V-NEXT:    vand.vx v8, v8, a3
-; RV32V-NEXT:    vsrl.vi v0, v24, 8
-; RV32V-NEXT:    vand.vv v0, v0, v16
-; RV32V-NEXT:    vor.vv v8, v0, v8
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vsrl.vi v0, v24, 24
-; RV32V-NEXT:    vand.vx v0, v0, a3
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 8
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vsrl.vi v24, v24, 8
-; RV32V-NEXT:    vand.vv v24, v24, v16
-; RV32V-NEXT:    vor.vv v24, v24, v0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -46270,11 +23191,10 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vor.vv v8, v8, v16
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -46282,424 +23202,457 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 9
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vx v0, v16, a3
-; RV32V-NEXT:    vsll.vi v0, v0, 24
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v16, v16, v8
-; RV32V-NEXT:    vsll.vi v16, v16, 8
-; RV32V-NEXT:    vor.vv v16, v0, v16
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vor.vv v8, v24, v8
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 9
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vx v0, v8, a3
-; RV32V-NEXT:    vsll.vi v0, v0, 24
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
-; RV32V-NEXT:    vsll.vi v8, v8, 8
-; RV32V-NEXT:    vor.vv v8, v0, v8
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vor.vv v16, v0, v16
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vor.vv v8, v0, v8
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v24, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v12, v8
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v12
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v28
+; RV32V-NEXT:    vxor.vv v8, v8, v4
+; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vor.vv v0, v16, v24
-; RV32V-NEXT:    lui t0, 61681
-; RV32V-NEXT:    addi t0, t0, -241
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v8, v12
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 9
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vor.vv v16, v8, v16
-; RV32V-NEXT:    vsetvli t1, zero, e32, m8, ta, ma
-; RV32V-NEXT:    vmv.v.x v8, t0
-; RV32V-NEXT:    vsetvli t0, zero, e64, m8, ta, ma
-; RV32V-NEXT:    vsrl.vi v24, v0, 4
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v0, v8
-; RV32V-NEXT:    vmv8r.v v0, v8
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 9
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v0
-; RV32V-NEXT:    vsll.vi v24, v24, 4
-; RV32V-NEXT:    vor.vv v8, v8, v24
-; RV32V-NEXT:    vsrl.vi v24, v16, 4
-; RV32V-NEXT:    vand.vv v16, v16, v0
-; RV32V-NEXT:    vand.vv v24, v24, v0
-; RV32V-NEXT:    vsll.vi v16, v16, 4
-; RV32V-NEXT:    vor.vv v16, v24, v16
-; RV32V-NEXT:    lui t0, 209715
-; RV32V-NEXT:    addi t0, t0, 819
-; RV32V-NEXT:    vsetvli t1, zero, e32, m8, ta, ma
-; RV32V-NEXT:    vmv.v.x v0, t0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vsetvli t0, zero, e64, m8, ta, ma
-; RV32V-NEXT:    vsrl.vi v24, v8, 2
-; RV32V-NEXT:    vand.vv v8, v8, v0
-; RV32V-NEXT:    vand.vv v24, v24, v0
-; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v24, v8
-; RV32V-NEXT:    vsrl.vi v24, v16, 2
-; RV32V-NEXT:    vand.vv v16, v16, v0
-; RV32V-NEXT:    vand.vv v24, v24, v0
-; RV32V-NEXT:    vsll.vi v16, v16, 2
-; RV32V-NEXT:    vor.vv v24, v24, v16
-; RV32V-NEXT:    lui t0, 349525
-; RV32V-NEXT:    addi t0, t0, 1365
-; RV32V-NEXT:    vsetvli t1, zero, e32, m8, ta, ma
-; RV32V-NEXT:    vmv.v.x v0, t0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vsetvli t0, zero, e64, m8, ta, ma
-; RV32V-NEXT:    vsrl.vi v16, v8, 1
-; RV32V-NEXT:    vand.vv v8, v8, v0
-; RV32V-NEXT:    vand.vv v16, v16, v0
-; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v16, v16, v8
-; RV32V-NEXT:    vsrl.vi v8, v24, 1
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v24, v0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v12, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vsll.vx v8, v8, a6
+; RV32V-NEXT:    vand.vx v12, v12, a2
+; RV32V-NEXT:    vsll.vx v12, v12, a5
+; RV32V-NEXT:    vor.vv v8, v8, v12
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi t0, sp, 280
-; RV32V-NEXT:    vlse64.v v0, (t0), zero
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v16, v12
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v24, v8
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vadd.vv v24, v24, v24
-; RV32V-NEXT:    vor.vv v8, v8, v24
-; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi t0, sp, 272
-; RV32V-NEXT:    addi t1, sp, 264
-; RV32V-NEXT:    addi a1, sp, 256
-; RV32V-NEXT:    addi a0, sp, 248
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
-; RV32V-NEXT:    csrr a7, vlenb
-; RV32V-NEXT:    slli a7, a7, 3
-; RV32V-NEXT:    mv t0, a7
-; RV32V-NEXT:    slli a7, a7, 2
-; RV32V-NEXT:    add t0, t0, a7
-; RV32V-NEXT:    slli a7, a7, 1
-; RV32V-NEXT:    add t0, t0, a7
-; RV32V-NEXT:    slli a7, a7, 1
-; RV32V-NEXT:    add t0, t0, a7
-; RV32V-NEXT:    slli a7, a7, 1
-; RV32V-NEXT:    add a7, a7, t0
-; RV32V-NEXT:    add a7, sp, a7
-; RV32V-NEXT:    addi a7, a7, 304
-; RV32V-NEXT:    vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (t1), zero
-; RV32V-NEXT:    vlse64.v v24, (a1), zero
-; RV32V-NEXT:    csrr a1, vlenb
-; RV32V-NEXT:    slli a1, a1, 3
-; RV32V-NEXT:    mv a7, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a7, a7, a1
-; RV32V-NEXT:    slli a1, a1, 2
-; RV32V-NEXT:    add a7, a7, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a7, a7, a1
-; RV32V-NEXT:    slli a1, a1, 1
-; RV32V-NEXT:    add a1, a1, a7
-; RV32V-NEXT:    add a1, sp, a1
-; RV32V-NEXT:    addi a1, a1, 304
-; RV32V-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
@@ -46708,300 +23661,239 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 240
-; RV32V-NEXT:    addi a1, sp, 232
-; RV32V-NEXT:    addi t0, sp, 224
-; RV32V-NEXT:    addi t1, sp, 216
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a7, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a7, a7, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a7, a7, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (a1), zero
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (t1), zero
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v20
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 208
-; RV32V-NEXT:    addi a1, sp, 200
-; RV32V-NEXT:    addi t0, sp, 192
-; RV32V-NEXT:    addi t1, sp, 184
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a7, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a7, a7, a0
 ; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a7, a7, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (a1), zero
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (t1), zero
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v16, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v28
+; RV32V-NEXT:    vsrl.vi v28, v12, 8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v4, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vand.vv v28, v28, v4
+; RV32V-NEXT:    vsrl.vi v16, v16, 24
+; RV32V-NEXT:    vand.vx v16, v16, a4
+; RV32V-NEXT:    vor.vv v16, v28, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 176
-; RV32V-NEXT:    addi a1, sp, 168
-; RV32V-NEXT:    addi t0, sp, 160
-; RV32V-NEXT:    addi t1, sp, 152
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a7, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a7, a7, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a7
+; RV32V-NEXT:    slli a0, a0, 8
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (a1), zero
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v28, v20, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -47009,249 +23901,3356 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (t1), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v0, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v28, v28, v0
+; RV32V-NEXT:    vand.vx v12, v12, a4
+; RV32V-NEXT:    vsll.vi v12, v12, 24
+; RV32V-NEXT:    vand.vv v4, v20, v4
+; RV32V-NEXT:    vsll.vi v4, v4, 8
+; RV32V-NEXT:    vor.vv v12, v12, v4
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v4, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v28, v28, v4
+; RV32V-NEXT:    vor.vv v8, v8, v12
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v28, v12
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v28
+; RV32V-NEXT:    vxor.vv v12, v12, v24
+; RV32V-NEXT:    vsrl.vx v20, v20, a5
+; RV32V-NEXT:    vand.vx v20, v20, a2
+; RV32V-NEXT:    vsrl.vx v12, v12, a6
+; RV32V-NEXT:    vor.vv v12, v20, v12
+; RV32V-NEXT:    vor.vv v12, v16, v12
+; RV32V-NEXT:    vor.vv v8, v8, v12
+; RV32V-NEXT:    vsrl.vi v12, v8, 4
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v16
+; RV32V-NEXT:    vand.vv v12, v12, v16
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v12, v8
+; RV32V-NEXT:    vsrl.vi v12, v8, 2
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 144
-; RV32V-NEXT:    addi a1, sp, 136
-; RV32V-NEXT:    addi t0, sp, 128
-; RV32V-NEXT:    addi t1, sp, 120
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v16
+; RV32V-NEXT:    vand.vv v12, v12, v16
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v12, v8
+; RV32V-NEXT:    vsrl.vi v12, v8, 1
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a7, a0
 ; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a7, a7, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a7, a7, a0
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a7
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (a1), zero
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v16
+; RV32V-NEXT:    vand.vv v12, v12, v16
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v12, v8
+; RV32V-NEXT:    vsrl.vi v8, v8, 1
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (t1), zero
+; RV32V-NEXT:    add sp, sp, a0
+; RV32V-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s2, 336(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s3, 332(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s4, 328(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s5, 324(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s6, 320(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s7, 316(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s8, 312(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    addi sp, sp, 352
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv4i64_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    addi sp, sp, -384
+; RV64V-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s1, 360(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s2, 352(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s3, 344(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s4, 336(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s5, 328(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s6, 320(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s7, 312(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s8, 304(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s9, 296(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s10, 288(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s11, 280(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    sub sp, sp, a0
+; RV64V-NEXT:    lui a5, 16
+; RV64V-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64V-NEXT:    vsrl.vi v24, v8, 24
+; RV64V-NEXT:    vsrl.vi v16, v8, 8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    li s3, 255
+; RV64V-NEXT:    lui a0, 61681
+; RV64V-NEXT:    lui a1, 209715
+; RV64V-NEXT:    lui a2, 349525
+; RV64V-NEXT:    vsrl.vi v28, v12, 24
+; RV64V-NEXT:    li a3, 56
+; RV64V-NEXT:    vsrl.vx v16, v8, a3
+; RV64V-NEXT:    li a4, 40
+; RV64V-NEXT:    vsrl.vx v20, v8, a4
+; RV64V-NEXT:    addi t3, a5, -256
+; RV64V-NEXT:    vsrl.vx v4, v12, a3
+; RV64V-NEXT:    vand.vx v20, v20, t3
+; RV64V-NEXT:    vor.vv v20, v20, v16
+; RV64V-NEXT:    vsrl.vx v16, v12, a4
+; RV64V-NEXT:    li t4, 40
+; RV64V-NEXT:    vand.vx v16, v16, t3
+; RV64V-NEXT:    vor.vv v0, v16, v4
+; RV64V-NEXT:    vsrl.vi v16, v12, 8
+; RV64V-NEXT:    li a3, 16
+; RV64V-NEXT:    li a6, 32
+; RV64V-NEXT:    li a5, 64
+; RV64V-NEXT:    li a4, 128
+; RV64V-NEXT:    li t0, 256
+; RV64V-NEXT:    li a7, 512
+; RV64V-NEXT:    li t2, 1
+; RV64V-NEXT:    lui t1, 4080
+; RV64V-NEXT:    vand.vx v4, v24, t1
+; RV64V-NEXT:    slli s3, s3, 24
+; RV64V-NEXT:    vand.vx v24, v28, t1
+; RV64V-NEXT:    vand.vx v16, v16, s3
+; RV64V-NEXT:    vor.vv v16, v16, v24
+; RV64V-NEXT:    vand.vx v24, v12, t1
+; RV64V-NEXT:    lui t5, 4080
+; RV64V-NEXT:    vsll.vi v28, v24, 24
+; RV64V-NEXT:    vor.vv v24, v16, v0
+; RV64V-NEXT:    vand.vx v16, v12, s3
+; RV64V-NEXT:    vsll.vi v16, v16, 8
+; RV64V-NEXT:    vor.vv v16, v28, v16
+; RV64V-NEXT:    li t1, 56
+; RV64V-NEXT:    vsll.vx v28, v12, t1
+; RV64V-NEXT:    vand.vx v12, v12, t3
+; RV64V-NEXT:    mv s0, t3
+; RV64V-NEXT:    sd t3, 224(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vsll.vx v12, v12, t4
+; RV64V-NEXT:    vor.vv v12, v28, v12
+; RV64V-NEXT:    vand.vx v0, v8, t5
+; RV64V-NEXT:    vor.vv v12, v12, v16
+; RV64V-NEXT:    vsll.vx v28, v8, t1
+; RV64V-NEXT:    addi t3, a0, -241
+; RV64V-NEXT:    addi t5, a1, 819
+; RV64V-NEXT:    addi t6, a2, 1365
+; RV64V-NEXT:    slli a0, t2, 11
+; RV64V-NEXT:    sd a0, 216(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 31
+; RV64V-NEXT:    sd a0, 208(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 32
+; RV64V-NEXT:    sd a0, 200(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 33
+; RV64V-NEXT:    sd a0, 192(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 34
+; RV64V-NEXT:    sd a0, 184(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 35
+; RV64V-NEXT:    sd a0, 176(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 36
+; RV64V-NEXT:    sd a0, 168(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 37
+; RV64V-NEXT:    sd a0, 160(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 38
+; RV64V-NEXT:    sd a0, 152(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 39
+; RV64V-NEXT:    sd a0, 144(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 40
+; RV64V-NEXT:    sd a0, 136(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t3, 32
+; RV64V-NEXT:    add t3, t3, a0
+; RV64V-NEXT:    slli a0, t5, 32
+; RV64V-NEXT:    add t5, t5, a0
+; RV64V-NEXT:    slli a0, t6, 32
+; RV64V-NEXT:    add a0, t6, a0
+; RV64V-NEXT:    slli a1, t2, 41
+; RV64V-NEXT:    sd a1, 128(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vor.vv v12, v12, v24
+; RV64V-NEXT:    vsrl.vi v16, v12, 4
+; RV64V-NEXT:    vand.vx v12, v12, t3
+; RV64V-NEXT:    vand.vx v16, v16, t3
+; RV64V-NEXT:    vsll.vi v12, v12, 4
+; RV64V-NEXT:    vor.vv v12, v16, v12
+; RV64V-NEXT:    vsrl.vi v16, v12, 2
+; RV64V-NEXT:    vand.vx v12, v12, t5
+; RV64V-NEXT:    vand.vx v16, v16, t5
+; RV64V-NEXT:    vsll.vi v12, v12, 2
+; RV64V-NEXT:    vor.vv v12, v16, v12
+; RV64V-NEXT:    vsrl.vi v16, v12, 1
+; RV64V-NEXT:    vand.vx v12, v12, a0
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vadd.vv v12, v12, v12
+; RV64V-NEXT:    vor.vv v24, v16, v12
+; RV64V-NEXT:    vand.vx v12, v24, a3
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 4
+; RV64V-NEXT:    mv a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    add a1, a1, a2
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 272
+; RV64V-NEXT:    vs4r.v v12, (a1) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    slli a1, t2, 42
+; RV64V-NEXT:    sd a1, 120(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    mv a2, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    add a1, a1, a2
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 272
+; RV64V-NEXT:    vl4r.v v12, (a1) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    sd s3, 232(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v12, s3
+; RV64V-NEXT:    vor.vv v12, v12, v4
+; RV64V-NEXT:    vand.vx v16, v24, a6
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 4
+; RV64V-NEXT:    mv a2, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a1, a1, a2
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 272
+; RV64V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    slli a1, t2, 43
+; RV64V-NEXT:    sd a1, 112(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vsll.vi v16, v0, 24
+; RV64V-NEXT:    vor.vv v12, v12, v20
+; RV64V-NEXT:    vand.vx v20, v8, s3
+; RV64V-NEXT:    vsll.vi v20, v20, 8
+; RV64V-NEXT:    vor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v0, v24, a5
+; RV64V-NEXT:    slli a1, t2, 44
+; RV64V-NEXT:    sd a1, 104(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v8, s0
+; RV64V-NEXT:    vsll.vx v8, v8, t4
+; RV64V-NEXT:    vor.vv v8, v28, v8
+; RV64V-NEXT:    vand.vx v20, v24, a4
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    mv a2, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    add a1, a1, a2
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 272
+; RV64V-NEXT:    vs4r.v v20, (a1) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    slli a1, t2, 45
+; RV64V-NEXT:    sd a1, 96(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vor.vv v8, v8, v16
+; RV64V-NEXT:    vand.vx v16, v24, t0
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    mv a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    add a1, a1, a2
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 272
+; RV64V-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    slli a1, t2, 46
+; RV64V-NEXT:    sd a1, 88(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vor.vv v8, v8, v12
+; RV64V-NEXT:    vsrl.vi v12, v8, 4
+; RV64V-NEXT:    sd t3, 240(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v8, t3
+; RV64V-NEXT:    vand.vx v12, v12, t3
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v12, v8
+; RV64V-NEXT:    vsrl.vi v12, v8, 2
+; RV64V-NEXT:    sd t5, 248(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v8, t5
+; RV64V-NEXT:    vand.vx v12, v12, t5
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v12, v8
+; RV64V-NEXT:    vsrl.vi v12, v8, 1
+; RV64V-NEXT:    sd a0, 256(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vand.vx v12, v12, a0
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v12, v8
+; RV64V-NEXT:    vand.vx v12, v24, a7
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 47
+; RV64V-NEXT:    sd a0, 80(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 48
+; RV64V-NEXT:    sd a0, 72(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 49
+; RV64V-NEXT:    sd a0, 64(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 50
+; RV64V-NEXT:    sd a0, 56(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 51
+; RV64V-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 52
+; RV64V-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 53
+; RV64V-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 54
+; RV64V-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli ra, t2, 55
+; RV64V-NEXT:    slli s11, t2, 56
+; RV64V-NEXT:    slli s10, t2, 57
+; RV64V-NEXT:    slli s9, t2, 58
+; RV64V-NEXT:    slli s8, t2, 59
+; RV64V-NEXT:    slli s6, t2, 60
+; RV64V-NEXT:    slli s7, t2, 61
+; RV64V-NEXT:    slli s5, t2, 62
+; RV64V-NEXT:    li a0, -1
+; RV64V-NEXT:    slli s4, a0, 63
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    lui a1, 1
+; RV64V-NEXT:    lui a2, 2
+; RV64V-NEXT:    lui a3, 4
+; RV64V-NEXT:    lui a4, 8
+; RV64V-NEXT:    lui a5, 32
+; RV64V-NEXT:    lui a6, 64
+; RV64V-NEXT:    lui a7, 128
+; RV64V-NEXT:    lui t0, 256
+; RV64V-NEXT:    lui t1, 512
+; RV64V-NEXT:    lui t2, 1024
+; RV64V-NEXT:    lui t3, 2048
+; RV64V-NEXT:    lui t4, 4096
+; RV64V-NEXT:    lui t5, 8192
+; RV64V-NEXT:    lui t6, 16384
+; RV64V-NEXT:    lui s0, 32768
+; RV64V-NEXT:    lui s1, 65536
+; RV64V-NEXT:    lui s2, 131072
+; RV64V-NEXT:    lui s3, 262144
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    sd s8, 8(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv s8, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add s8, s8, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, s8
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 216(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv s8, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, s8
+; RV64V-NEXT:    ld s8, 8(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, a1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, a2
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, a3
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, a4
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, a5
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, a6
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, a7
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 8
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, t0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, t1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, t2
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, t3
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, t4
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, t5
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, t6
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, s0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, s1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, s2
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, s3
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 208(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 200(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 192(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 184(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 176(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 168(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 160(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 152(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 144(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 136(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 128(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 120(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 112(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 104(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 96(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 88(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 80(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 72(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 64(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 56(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 7
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v12, v24, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, ra
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, s11
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, s10
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, s9
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, s8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v12, v24, s6
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vi v12, v24, 2
+; RV64V-NEXT:    vand.vi v16, v24, 1
+; RV64V-NEXT:    vand.vi v20, v24, 4
+; RV64V-NEXT:    vand.vi v28, v24, 8
+; RV64V-NEXT:    vand.vx v4, v24, s7
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v4, v24, s5
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vand.vx v24, v24, s4
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v24, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vmul.vv v12, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vmul.vv v16, v8, v20
+; RV64V-NEXT:    vmul.vv v20, v8, v28
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v24, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v24, v8, v24
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v28, v8, v12
+; RV64V-NEXT:    vmul.vv v4, v8, v0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v0, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v0, v8, v0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 8
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    addi a0, sp, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 8
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 7
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v8, v8, v12
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v8, v12
+; RV64V-NEXT:    vxor.vv v12, v12, v16
+; RV64V-NEXT:    vxor.vv v12, v12, v20
+; RV64V-NEXT:    vxor.vv v12, v12, v24
+; RV64V-NEXT:    vxor.vv v12, v12, v28
+; RV64V-NEXT:    vxor.vv v12, v12, v4
+; RV64V-NEXT:    vxor.vv v12, v12, v0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v12, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v20, v16, v8
+; RV64V-NEXT:    addi a0, sp, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v20, v20, v8
+; RV64V-NEXT:    li a2, 56
+; RV64V-NEXT:    vsll.vx v12, v12, a2
+; RV64V-NEXT:    ld a1, 224(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v16, a1
+; RV64V-NEXT:    li a0, 40
+; RV64V-NEXT:    vsll.vx v16, v16, a0
+; RV64V-NEXT:    vor.vv v12, v12, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 6
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v16, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v20, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 8
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v20, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v20, v16, v20
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v20, v20, v24
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v20, v20, v24
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 6
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v20, v20, v24
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v20, v20, v24
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v20, v20, v24
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v20, v20, v24
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v20, v20, v24
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v24, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v20, v24
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v28, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v28
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v28, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v28
+; RV64V-NEXT:    vsrl.vi v28, v16, 8
+; RV64V-NEXT:    ld a4, 232(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v28, v28, a4
+; RV64V-NEXT:    vsrl.vi v20, v20, 24
+; RV64V-NEXT:    lui a3, 4080
+; RV64V-NEXT:    vand.vx v20, v20, a3
+; RV64V-NEXT:    vor.vv v20, v28, v20
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 5
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 272
+; RV64V-NEXT:    vl4r.v v28, (a5) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v28
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 2
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 1
+; RV64V-NEXT:    add a6, a6, a5
+; RV64V-NEXT:    slli a5, a5, 1
+; RV64V-NEXT:    add a6, a6, a5
+; RV64V-NEXT:    slli a5, a5, 4
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 272
+; RV64V-NEXT:    vl4r.v v28, (a5) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v28
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 1
+; RV64V-NEXT:    add a6, a6, a5
+; RV64V-NEXT:    slli a5, a5, 4
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 272
+; RV64V-NEXT:    vl4r.v v28, (a5) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v28
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 2
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 2
+; RV64V-NEXT:    add a6, a6, a5
+; RV64V-NEXT:    slli a5, a5, 4
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 272
+; RV64V-NEXT:    vl4r.v v28, (a5) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v28
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 4
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 4
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 272
+; RV64V-NEXT:    vl4r.v v28, (a5) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v28
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 2
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 1
+; RV64V-NEXT:    add a6, a6, a5
+; RV64V-NEXT:    slli a5, a5, 5
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 272
+; RV64V-NEXT:    vl4r.v v28, (a5) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v28, v24, v28
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 5
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 272
+; RV64V-NEXT:    vl4r.v v4, (a5) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v28, v28, v4
+; RV64V-NEXT:    vand.vx v16, v16, a3
+; RV64V-NEXT:    vsll.vi v16, v16, 24
+; RV64V-NEXT:    vand.vx v4, v24, a4
+; RV64V-NEXT:    vsll.vi v4, v4, 8
+; RV64V-NEXT:    vor.vv v16, v16, v4
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v4, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v28, v28, v4
+; RV64V-NEXT:    vor.vv v12, v12, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 272
+; RV64V-NEXT:    vl4r.v v16, (a3) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v28, v16
+; RV64V-NEXT:    vsrl.vx v24, v24, a0
+; RV64V-NEXT:    vand.vx v24, v24, a1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v28
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v28
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v28
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 272
+; RV64V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vx v8, v8, a2
+; RV64V-NEXT:    vor.vv v8, v24, v8
+; RV64V-NEXT:    vor.vv v8, v20, v8
+; RV64V-NEXT:    vor.vv v8, v12, v8
+; RV64V-NEXT:    vsrl.vi v12, v8, 4
+; RV64V-NEXT:    ld a0, 240(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vand.vx v12, v12, a0
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v12, v8
+; RV64V-NEXT:    vsrl.vi v12, v8, 2
+; RV64V-NEXT:    ld a0, 248(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vand.vx v12, v12, a0
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v12, v8
+; RV64V-NEXT:    vsrl.vi v12, v8, 1
+; RV64V-NEXT:    ld a0, 256(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vand.vx v12, v12, a0
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v12, v8
+; RV64V-NEXT:    vsrl.vi v8, v8, 1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add sp, sp, a0
+; RV64V-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s1, 360(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s2, 352(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s3, 344(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s4, 336(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s5, 328(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s6, 320(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s7, 312(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s8, 304(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s9, 296(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s10, 288(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s11, 280(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    addi sp, sp, 384
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv4i64_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v12
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv4i64_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC-NEXT:    vclmulh.vv v8, v8, v12
+; RV64ZVBC-NEXT:    ret
+  %va.ext = zext <vscale x 4 x i64> %va to <vscale x 4 x i128>
+  %vb.ext = zext <vscale x 4 x i64> %vb to <vscale x 4 x i128>
+  %clmul = call <vscale x 4 x i128> @llvm.clmul.nxv4i128(<vscale x 4 x i128> %va.ext, <vscale x 4 x i128> %vb.ext)
+  %res.ext = lshr <vscale x 4 x i128> %clmul, splat(i128 64)
+  %res = trunc <vscale x 4 x i128> %res.ext to <vscale x 4 x i64>
+  ret <vscale x 4 x i64> %res
+}
+
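
[Aside for readers following the test diff; this paragraph and sketch are not part of the patch.] The IR in clmulh_nxv4i64_vv above models a carry-less multiply-high: both operands are zero-extended to i128, multiplied with llvm.clmul, and the upper 64 bits are selected via lshr by 64 plus trunc. A minimal scalar C++ sketch of the same computation, assuming a host compiler with unsigned __int128 (the function name clmulh64 is illustrative only):

    #include <cstdint>

    // Carry-less (polynomial) multiply of two 64-bit values, returning the
    // upper 64 bits of the 128-bit product -- the scalar analogue of the
    // lshr-64 + trunc applied to the i128 clmul result in the IR above.
    static uint64_t clmulh64(uint64_t a, uint64_t b) {
      unsigned __int128 acc = 0;
      for (unsigned i = 0; i < 64; ++i)
        if ((b >> i) & 1)
          acc ^= (unsigned __int128)a << i; // XOR-accumulate shifted partial products
      return (uint64_t)(acc >> 64);
    }

With Zvbc this whole pattern selects to a single vclmulh.vv (the RV32ZVBC/RV64ZVBC check lines above); without it, the expansion falls back to the long shift/multiply/xor sequence seen in the RV64V output.
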
+define <vscale x 4 x i64> @clmulh_nxv4i64_vx(<vscale x 4 x i64> %va, i64 %b) nounwind {
+; RV32V-LABEL: clmulh_nxv4i64_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -368
+; RV32V-NEXT:    sw ra, 364(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s0, 360(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s1, 356(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s2, 352(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s3, 348(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s4, 344(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s5, 340(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s6, 336(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s7, 332(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s8, 328(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s9, 324(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s10, 320(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s11, 316(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 2
+; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a3, a3, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a3, a3, a2
+; RV32V-NEXT:    slli a2, a2, 1
+; RV32V-NEXT:    add a3, a3, a2
+; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    add a2, a2, a3
+; RV32V-NEXT:    sub sp, sp, a2
+; RV32V-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; RV32V-NEXT:    vmv4r.v v0, v8
+; RV32V-NEXT:    sw a0, 16(sp)
+; RV32V-NEXT:    sw a1, 20(sp)
+; RV32V-NEXT:    addi s10, sp, 16
+; RV32V-NEXT:    lui s11, 1044480
+; RV32V-NEXT:    lui s0, 524288
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    li ra, 2
+; RV32V-NEXT:    li t5, 4
+; RV32V-NEXT:    li t3, 8
+; RV32V-NEXT:    li t6, 16
+; RV32V-NEXT:    li t4, 32
+; RV32V-NEXT:    li t2, 64
+; RV32V-NEXT:    li t1, 128
+; RV32V-NEXT:    li t0, 256
+; RV32V-NEXT:    li a7, 512
+; RV32V-NEXT:    li a3, 1024
+; RV32V-NEXT:    lui a2, 1
+; RV32V-NEXT:    lui a4, 2
+; RV32V-NEXT:    lui a1, 4
+; RV32V-NEXT:    lui a5, 8
+; RV32V-NEXT:    lui s1, 16
+; RV32V-NEXT:    lui a6, 32
+; RV32V-NEXT:    lui s2, 64
+; RV32V-NEXT:    lui s3, 128
+; RV32V-NEXT:    lui s4, 256
+; RV32V-NEXT:    lui s5, 512
+; RV32V-NEXT:    lui s6, 1024
+; RV32V-NEXT:    lui s7, 2048
+; RV32V-NEXT:    lui s8, 4096
+; RV32V-NEXT:    lui s9, 8192
+; RV32V-NEXT:    vlse64.v v4, (s10), zero
+; RV32V-NEXT:    lui s10, 16384
+; RV32V-NEXT:    sw s11, 264(sp)
+; RV32V-NEXT:    lui s11, 32768
+; RV32V-NEXT:    sw zero, 268(sp)
+; RV32V-NEXT:    sw s0, 24(sp)
+; RV32V-NEXT:    sw zero, 28(sp)
+; RV32V-NEXT:    sw zero, 288(sp)
+; RV32V-NEXT:    sw a0, 292(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw ra, 276(sp)
+; RV32V-NEXT:    lui ra, 65536
+; RV32V-NEXT:    sw zero, 280(sp)
+; RV32V-NEXT:    sw t5, 284(sp)
+; RV32V-NEXT:    lui t5, 131072
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw t3, 260(sp)
+; RV32V-NEXT:    lui t3, 262144
+; RV32V-NEXT:    sw zero, 248(sp)
+; RV32V-NEXT:    sw t6, 252(sp)
+; RV32V-NEXT:    sw zero, 240(sp)
+; RV32V-NEXT:    sw t4, 244(sp)
+; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    sw t2, 236(sp)
+; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw t1, 228(sp)
+; RV32V-NEXT:    sw zero, 216(sp)
+; RV32V-NEXT:    sw t0, 220(sp)
+; RV32V-NEXT:    sw zero, 208(sp)
+; RV32V-NEXT:    sw a7, 212(sp)
+; RV32V-NEXT:    sw zero, 200(sp)
+; RV32V-NEXT:    sw a3, 204(sp)
+; RV32V-NEXT:    slli a3, a0, 11
+; RV32V-NEXT:    sw zero, 192(sp)
+; RV32V-NEXT:    sw a3, 196(sp)
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw a2, 188(sp)
+; RV32V-NEXT:    lui t1, 1
+; RV32V-NEXT:    sw zero, 176(sp)
+; RV32V-NEXT:    sw a4, 180(sp)
+; RV32V-NEXT:    lui t4, 2
+; RV32V-NEXT:    sw zero, 168(sp)
+; RV32V-NEXT:    sw a1, 172(sp)
+; RV32V-NEXT:    lui t2, 4
+; RV32V-NEXT:    sw zero, 160(sp)
+; RV32V-NEXT:    sw a5, 164(sp)
+; RV32V-NEXT:    lui t6, 8
+; RV32V-NEXT:    sw zero, 152(sp)
+; RV32V-NEXT:    sw s1, 156(sp)
+; RV32V-NEXT:    sw zero, 144(sp)
+; RV32V-NEXT:    sw a6, 148(sp)
+; RV32V-NEXT:    sw zero, 136(sp)
+; RV32V-NEXT:    sw s2, 140(sp)
+; RV32V-NEXT:    sw zero, 128(sp)
+; RV32V-NEXT:    sw s3, 132(sp)
+; RV32V-NEXT:    sw zero, 120(sp)
+; RV32V-NEXT:    sw s4, 124(sp)
+; RV32V-NEXT:    sw zero, 112(sp)
+; RV32V-NEXT:    sw s5, 116(sp)
+; RV32V-NEXT:    sw zero, 104(sp)
+; RV32V-NEXT:    sw s6, 108(sp)
+; RV32V-NEXT:    sw zero, 96(sp)
+; RV32V-NEXT:    sw s7, 100(sp)
+; RV32V-NEXT:    sw zero, 88(sp)
+; RV32V-NEXT:    sw s8, 92(sp)
+; RV32V-NEXT:    sw zero, 80(sp)
+; RV32V-NEXT:    sw s9, 84(sp)
+; RV32V-NEXT:    sw zero, 72(sp)
+; RV32V-NEXT:    sw s10, 76(sp)
+; RV32V-NEXT:    sw zero, 64(sp)
+; RV32V-NEXT:    sw s11, 68(sp)
+; RV32V-NEXT:    sw zero, 56(sp)
+; RV32V-NEXT:    sw ra, 60(sp)
+; RV32V-NEXT:    sw zero, 48(sp)
+; RV32V-NEXT:    sw t5, 52(sp)
+; RV32V-NEXT:    sw zero, 40(sp)
+; RV32V-NEXT:    sw t3, 44(sp)
+; RV32V-NEXT:    sw zero, 32(sp)
+; RV32V-NEXT:    sw s0, 36(sp)
+; RV32V-NEXT:    addi a1, sp, 264
+; RV32V-NEXT:    vlse64.v v12, (a1), zero
+; RV32V-NEXT:    li a6, 56
+; RV32V-NEXT:    vsrl.vx v16, v8, a6
+; RV32V-NEXT:    li a5, 40
+; RV32V-NEXT:    vsrl.vx v20, v8, a5
+; RV32V-NEXT:    vsll.vx v24, v8, a6
+; RV32V-NEXT:    addi a2, s1, -256
+; RV32V-NEXT:    vand.vx v20, v20, a2
+; RV32V-NEXT:    vand.vx v28, v8, a2
+; RV32V-NEXT:    vor.vv v8, v20, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vsll.vx v16, v28, a5
+; RV32V-NEXT:    vor.vv v8, v24, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vmv.v.v v20, v4
+; RV32V-NEXT:    vsrl.vx v24, v4, a6
+; RV32V-NEXT:    vsrl.vx v28, v4, a5
+; RV32V-NEXT:    vsll.vx v4, v4, a6
+; RV32V-NEXT:    vand.vx v28, v28, a2
+; RV32V-NEXT:    vor.vv v28, v28, v24
+; RV32V-NEXT:    vand.vx v24, v20, a2
+; RV32V-NEXT:    vsll.vx v24, v24, a5
+; RV32V-NEXT:    vor.vv v8, v4, v24
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vmv4r.v v8, v0
+; RV32V-NEXT:    vsrl.vi v4, v0, 24
+; RV32V-NEXT:    lui a4, 4080
+; RV32V-NEXT:    vand.vx v4, v4, a4
+; RV32V-NEXT:    vsrl.vi v0, v0, 8
+; RV32V-NEXT:    vand.vv v0, v0, v12
+; RV32V-NEXT:    vor.vv v4, v0, v4
+; RV32V-NEXT:    vsrl.vi v0, v20, 24
+; RV32V-NEXT:    vand.vx v0, v0, a4
+; RV32V-NEXT:    vsrl.vi v16, v20, 8
+; RV32V-NEXT:    vand.vv v16, v16, v12
+; RV32V-NEXT:    vor.vv v16, v16, v0
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v24, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vor.vv v24, v4, v24
+; RV32V-NEXT:    vand.vx v4, v8, a4
+; RV32V-NEXT:    vsll.vi v4, v4, 24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v8, v8, v12
+; RV32V-NEXT:    vsll.vi v8, v8, 8
+; RV32V-NEXT:    vor.vv v4, v4, v8
+; RV32V-NEXT:    lui a7, 61681
+; RV32V-NEXT:    addi a7, a7, -241
+; RV32V-NEXT:    vor.vv v8, v16, v28
+; RV32V-NEXT:    vand.vx v16, v20, a4
+; RV32V-NEXT:    vsll.vi v16, v16, 24
+; RV32V-NEXT:    vand.vv v12, v20, v12
+; RV32V-NEXT:    vsll.vi v12, v12, 8
+; RV32V-NEXT:    vor.vv v12, v16, v12
+; RV32V-NEXT:    vsetvli t0, zero, e32, m4, ta, ma
+; RV32V-NEXT:    vmv.v.x v0, a7
+; RV32V-NEXT:    lui a7, 209715
+; RV32V-NEXT:    addi a7, a7, 819
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vsetvli t0, zero, e64, m4, ta, ma
+; RV32V-NEXT:    vor.vv v16, v16, v4
+; RV32V-NEXT:    vsetvli t0, zero, e32, m4, ta, ma
+; RV32V-NEXT:    vmv.v.x v4, a7
+; RV32V-NEXT:    lui a7, 349525
+; RV32V-NEXT:    addi a7, a7, 1365
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 112
-; RV32V-NEXT:    addi a1, sp, 104
-; RV32V-NEXT:    addi t0, sp, 96
-; RV32V-NEXT:    addi t1, sp, 88
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
-; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a7, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a7, a7, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a7
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (a1), zero
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vsetvli t0, zero, e64, m4, ta, ma
+; RV32V-NEXT:    vor.vv v28, v20, v12
+; RV32V-NEXT:    vsetvli t0, zero, e32, m4, ta, ma
+; RV32V-NEXT:    vmv.v.x v12, a7
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
@@ -47263,8 +27262,13 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (t1), zero
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vsetvli a7, zero, e64, m4, ta, ma
+; RV32V-NEXT:    vor.vv v24, v16, v24
+; RV32V-NEXT:    addi a7, sp, 24
+; RV32V-NEXT:    vlse64.v v12, (a7), zero
+; RV32V-NEXT:    vor.vv v8, v28, v8
+; RV32V-NEXT:    vsrl.vi v16, v24, 4
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
@@ -47274,37 +27278,39 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v0, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v20, v24, v0
+; RV32V-NEXT:    vand.vv v16, v16, v0
+; RV32V-NEXT:    vsll.vi v20, v20, 4
+; RV32V-NEXT:    vor.vv v16, v16, v20
+; RV32V-NEXT:    vsrl.vi v20, v8, 4
+; RV32V-NEXT:    vand.vv v8, v8, v0
+; RV32V-NEXT:    vand.vv v20, v20, v0
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v20, v8
+; RV32V-NEXT:    vsrl.vi v20, v16, 2
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v16, v16, v4
+; RV32V-NEXT:    vand.vv v20, v20, v4
+; RV32V-NEXT:    vsll.vi v16, v16, 2
+; RV32V-NEXT:    vor.vv v16, v20, v16
+; RV32V-NEXT:    vsrl.vi v20, v8, 2
+; RV32V-NEXT:    vand.vv v8, v8, v4
+; RV32V-NEXT:    vand.vv v20, v20, v4
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v20, v8
+; RV32V-NEXT:    vsrl.vi v20, v16, 1
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
@@ -47316,162 +27322,153 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vand.vv v16, v16, v28
+; RV32V-NEXT:    vand.vv v20, v20, v28
+; RV32V-NEXT:    vadd.vv v16, v16, v16
+; RV32V-NEXT:    vor.vv v24, v20, v16
+; RV32V-NEXT:    vsrl.vi v16, v8, 1
+; RV32V-NEXT:    vand.vv v8, v8, v28
+; RV32V-NEXT:    vand.vv v16, v16, v28
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    addi a7, sp, 288
+; RV32V-NEXT:    vlse64.v v16, (a7), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    addi a7, sp, 272
+; RV32V-NEXT:    addi t0, sp, 280
+; RV32V-NEXT:    addi a0, sp, 256
+; RV32V-NEXT:    vlse64.v v12, (a7), zero
+; RV32V-NEXT:    vlse64.v v20, (t0), zero
+; RV32V-NEXT:    vlse64.v v28, (a0), zero
+; RV32V-NEXT:    vand.vv v16, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v12
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 80
-; RV32V-NEXT:    addi a1, sp, 72
-; RV32V-NEXT:    addi t0, sp, 64
-; RV32V-NEXT:    addi t1, sp, 56
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a7, a0
 ; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a7, a7, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a7
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (a1), zero
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (t1), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v28
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    addi a1, sp, 248
+; RV32V-NEXT:    addi a7, sp, 240
+; RV32V-NEXT:    addi t0, sp, 232
+; RV32V-NEXT:    addi a0, sp, 224
+; RV32V-NEXT:    vlse64.v v12, (a1), zero
+; RV32V-NEXT:    vlse64.v v16, (a7), zero
+; RV32V-NEXT:    vlse64.v v20, (t0), zero
+; RV32V-NEXT:    vlse64.v v28, (a0), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 216
+; RV32V-NEXT:    addi a1, sp, 208
+; RV32V-NEXT:    addi a7, sp, 200
+; RV32V-NEXT:    addi t0, sp, 192
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v16, (a1), zero
+; RV32V-NEXT:    vlse64.v v20, (a7), zero
+; RV32V-NEXT:    vlse64.v v28, (t0), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 48
-; RV32V-NEXT:    addi a1, sp, 40
-; RV32V-NEXT:    addi t0, sp, 32
-; RV32V-NEXT:    addi t1, sp, 24
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a7, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a7
+; RV32V-NEXT:    slli a0, a0, 8
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v0, (a1), zero
-; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -47480,46 +27477,13 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vlse64.v v24, (t1), zero
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vv v24, v8, v0
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 8
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v28
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
@@ -47533,12 +27497,20 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 184
+; RV32V-NEXT:    addi a1, sp, 176
+; RV32V-NEXT:    addi a7, sp, 168
+; RV32V-NEXT:    addi t0, sp, 160
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v16, (a1), zero
+; RV32V-NEXT:    vlse64.v v20, (a7), zero
+; RV32V-NEXT:    vlse64.v v28, (t0), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -47548,7 +27520,8 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
@@ -47560,306 +27533,393 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s11
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s10
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 152
+; RV32V-NEXT:    addi a1, sp, 144
+; RV32V-NEXT:    addi a7, sp, 136
+; RV32V-NEXT:    addi t0, sp, 128
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v16, (a1), zero
+; RV32V-NEXT:    vlse64.v v20, (a7), zero
+; RV32V-NEXT:    vlse64.v v28, (t0), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s9
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s8
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s7
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s6
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 120
+; RV32V-NEXT:    addi a1, sp, 112
+; RV32V-NEXT:    addi a7, sp, 104
+; RV32V-NEXT:    addi t0, sp, 96
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v16, (a1), zero
+; RV32V-NEXT:    vlse64.v v20, (a7), zero
+; RV32V-NEXT:    vlse64.v v28, (t0), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s5
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, ra
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s4
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s3
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 88
+; RV32V-NEXT:    addi a1, sp, 80
+; RV32V-NEXT:    addi a7, sp, 72
+; RV32V-NEXT:    addi t0, sp, 64
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v16, (a1), zero
+; RV32V-NEXT:    vlse64.v v20, (a7), zero
+; RV32V-NEXT:    vlse64.v v28, (t0), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s2
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s1
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, t6
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui a0, 32
-; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 56
+; RV32V-NEXT:    addi a1, sp, 48
+; RV32V-NEXT:    addi a7, sp, 40
+; RV32V-NEXT:    addi t0, sp, 32
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    vlse64.v v16, (a1), zero
+; RV32V-NEXT:    vlse64.v v20, (a7), zero
+; RV32V-NEXT:    vlse64.v v28, (t0), zero
+; RV32V-NEXT:    vand.vv v12, v8, v12
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui a0, 64
-; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui a0, 128
-; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui a0, 256
-; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vv v12, v8, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui a0, 512
-; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vx v12, v8, a0
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui a0, 1024
-; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vand.vx v28, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vand.vx v4, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v0, v8, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vand.vx v12, v8, a0
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui a0, 2048
-; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vand.vx v12, v8, a0
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    lui a0, 4096
-; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vand.vx v12, v8, a0
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, a3
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, t2
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, t1
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, s0
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, t4
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, a6
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, t2
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, a5
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, t6
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vx v24, v8, a4
-; RV32V-NEXT:    addi a0, sp, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vi v24, v8, 2
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s1
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    slli a0, a0, 7
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vi v0, v8, 1
-; RV32V-NEXT:    vand.vi v24, v8, 4
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    lui a0, 32
+; RV32V-NEXT:    vand.vx v12, v8, a0
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vand.vi v8, v8, 8
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s2
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -47867,10 +27927,10 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v24, v16, v24
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s3
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
@@ -47880,10 +27940,10 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v24, v16, v0
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s4
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -47891,84 +27951,108 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s5
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s6
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s7
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v0, v16, v8
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s9
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s10
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, s11
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, ra
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, t5
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vx v12, v8, t3
+; RV32V-NEXT:    addi a0, sp, 304
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vand.vi v12, v8, 2
+; RV32V-NEXT:    vand.vi v16, v8, 1
+; RV32V-NEXT:    vand.vi v20, v8, 4
+; RV32V-NEXT:    vand.vi v8, v8, 8
+; RV32V-NEXT:    vmul.vv v12, v24, v12
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v12, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    vmul.vv v12, v24, v16
+; RV32V-NEXT:    vmul.vv v16, v24, v20
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -47978,18 +28062,32 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v20, v24, v8
+; RV32V-NEXT:    vmul.vv v28, v24, v28
+; RV32V-NEXT:    vmul.vv v4, v24, v4
+; RV32V-NEXT:    vmul.vv v0, v24, v0
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -47997,22 +28095,22 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
@@ -48020,39 +28118,39 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -48060,68 +28158,68 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -48131,15 +28229,15 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 7
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -48147,22 +28245,24 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
@@ -48170,39 +28270,43 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -48210,152 +28314,84 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
@@ -48363,393 +28399,359 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 304
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    slli a0, a0, 8
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
@@ -48759,12 +28761,12 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
@@ -48774,318 +28776,326 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 8
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 8
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -49094,13 +29104,14 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -49108,96 +29119,135 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v16, v16, v8
-; RV32V-NEXT:    vxor.vi v8, v24, 0
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v0
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v24, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v12, v8
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -49207,10 +29257,14 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v12
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v28
+; RV32V-NEXT:    vxor.vv v8, v8, v4
+; RV32V-NEXT:    vxor.vv v8, v8, v0
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -49218,10 +29272,10 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v8, v12
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
@@ -49229,19 +29283,19 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -49249,34 +29303,34 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -49286,10 +29340,10 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -49297,10 +29351,10 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
@@ -49308,19 +29362,19 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -49328,34 +29382,34 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -49363,244 +29417,216 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vsll.vx v8, v8, a6
+; RV32V-NEXT:    vand.vx v12, v12, a2
+; RV32V-NEXT:    vsll.vx v12, v12, a5
+; RV32V-NEXT:    vor.vv v8, v8, v12
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v16, v12
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v12, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v20
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
@@ -49610,174 +29636,181 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v16, v20
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v28
+; RV32V-NEXT:    vsrl.vi v28, v12, 8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v4, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vand.vv v28, v28, v4
+; RV32V-NEXT:    vsrl.vi v16, v16, 24
+; RV32V-NEXT:    vand.vx v16, v16, a4
+; RV32V-NEXT:    vor.vv v16, v28, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v20, v20, v28
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 8
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v28, v20, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -49786,120 +29819,141 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vsrl.vx v16, v8, t4
-; RV32V-NEXT:    vand.vx v16, v16, t3
-; RV32V-NEXT:    vsrl.vx v24, v8, t5
-; RV32V-NEXT:    vor.vv v16, v16, v24
+; RV32V-NEXT:    vl4r.v v0, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v28, v28, v0
+; RV32V-NEXT:    vand.vx v12, v12, a4
+; RV32V-NEXT:    vsll.vi v12, v12, 24
+; RV32V-NEXT:    vand.vv v4, v20, v4
+; RV32V-NEXT:    vsll.vi v4, v4, 8
+; RV32V-NEXT:    vor.vv v12, v12, v4
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; RV32V-NEXT:    vsrl.vi v24, v8, 24
-; RV32V-NEXT:    vand.vx v24, v24, a3
-; RV32V-NEXT:    vsrl.vi v0, v8, 8
+; RV32V-NEXT:    vl4r.v v4, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v28, v28, v4
+; RV32V-NEXT:    vor.vv v8, v8, v12
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v0, v0, v16
-; RV32V-NEXT:    vor.vv v24, v0, v24
-; RV32V-NEXT:    vand.vv v0, v8, v16
-; RV32V-NEXT:    vsll.vi v0, v0, 8
-; RV32V-NEXT:    vand.vx v16, v8, a3
-; RV32V-NEXT:    vsll.vi v16, v16, 24
-; RV32V-NEXT:    vor.vv v16, v16, v0
-; RV32V-NEXT:    vsll.vx v0, v8, t5
-; RV32V-NEXT:    vand.vx v8, v8, t3
-; RV32V-NEXT:    vsll.vx v8, v8, t4
-; RV32V-NEXT:    vor.vv v8, v0, v8
+; RV32V-NEXT:    vl4r.v v12, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v28, v12
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vor.vv v24, v24, v0
-; RV32V-NEXT:    vor.vv v8, v8, v16
-; RV32V-NEXT:    vor.vv v8, v8, v24
-; RV32V-NEXT:    vsrl.vi v16, v8, 4
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 9
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v24
-; RV32V-NEXT:    vand.vv v16, v16, v24
-; RV32V-NEXT:    vsll.vi v8, v8, 4
-; RV32V-NEXT:    vor.vv v8, v16, v8
-; RV32V-NEXT:    vsrl.vi v16, v8, 2
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v28
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v12, v12, v28
+; RV32V-NEXT:    vxor.vv v12, v12, v24
+; RV32V-NEXT:    vsrl.vx v20, v20, a5
+; RV32V-NEXT:    vand.vx v20, v20, a2
+; RV32V-NEXT:    vsrl.vx v12, v12, a6
+; RV32V-NEXT:    vor.vv v12, v20, v12
+; RV32V-NEXT:    vor.vv v12, v16, v12
+; RV32V-NEXT:    vor.vv v8, v8, v12
+; RV32V-NEXT:    vsrl.vi v12, v8, 4
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v16
+; RV32V-NEXT:    vand.vv v12, v12, v16
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v12, v8
+; RV32V-NEXT:    vsrl.vi v12, v8, 2
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v24
-; RV32V-NEXT:    vand.vv v16, v16, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v16
+; RV32V-NEXT:    vand.vv v12, v12, v16
 ; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v16, v8
-; RV32V-NEXT:    vsrl.vi v16, v8, 1
+; RV32V-NEXT:    vor.vv v8, v12, v8
+; RV32V-NEXT:    vsrl.vi v12, v8, 1
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v24
-; RV32V-NEXT:    vand.vv v16, v16, v24
+; RV32V-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v16
+; RV32V-NEXT:    vand.vv v12, v12, v16
 ; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vor.vv v8, v12, v8
 ; RV32V-NEXT:    vsrl.vi v8, v8, 1
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add sp, sp, a0
 ; RV32V-NEXT:    lw ra, 364(sp) # 4-byte Folded Reload
@@ -49918,11 +29972,11 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV32V-NEXT:    addi sp, sp, 368
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmulh_nxv8i64_vx:
+; RV64V-LABEL: clmulh_nxv4i64_vx:
 ; RV64V:       # %bb.0:
 ; RV64V-NEXT:    addi sp, sp, -16
 ; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 5
+; RV64V-NEXT:    slli a1, a1, 2
 ; RV64V-NEXT:    sub sp, sp, a1
 ; RV64V-NEXT:    li a1, 56
 ; RV64V-NEXT:    lui t2, 16
@@ -49957,13 +30011,9 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV64V-NEXT:    slli a7, a7, 24
 ; RV64V-NEXT:    or t3, a7, t4
 ; RV64V-NEXT:    li a7, 40
-; RV64V-NEXT:    vsetvli t4, zero, e64, m8, ta, ma
-; RV64V-NEXT:    vsrl.vi v24, v8, 24
-; RV64V-NEXT:    vsrl.vx v16, v8, a1
-; RV64V-NEXT:    vsrl.vx v0, v8, a7
-; RV64V-NEXT:    vand.vx v0, v0, a6
-; RV64V-NEXT:    vor.vv v16, v0, v16
-; RV64V-NEXT:    vsrl.vi v0, v8, 8
+; RV64V-NEXT:    vsetvli t4, zero, e64, m4, ta, ma
+; RV64V-NEXT:    vsrl.vi v16, v8, 24
+; RV64V-NEXT:    vsrl.vi v12, v8, 8
 ; RV64V-NEXT:    or t1, t5, t1
 ; RV64V-NEXT:    slli t4, a0, 56
 ; RV64V-NEXT:    and a0, a0, a6
@@ -49972,438 +30022,375 @@ define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nou
 ; RV64V-NEXT:    li a0, 1
 ; RV64V-NEXT:    or t4, t4, t3
 ; RV64V-NEXT:    lui t3, 1
-; RV64V-NEXT:    vand.vx v24, v24, a2
-; RV64V-NEXT:    vand.vx v0, v0, t0
-; RV64V-NEXT:    vor.vv v24, v0, v24
-; RV64V-NEXT:    vand.vx v0, v8, a2
-; RV64V-NEXT:    vsll.vi v0, v0, 24
-; RV64V-NEXT:    vor.vv v16, v24, v16
+; RV64V-NEXT:    vsrl.vx v20, v8, a1
+; RV64V-NEXT:    vsrl.vx v24, v8, a7
+; RV64V-NEXT:    vand.vx v16, v16, a2
+; RV64V-NEXT:    vand.vx v28, v8, a2
+; RV64V-NEXT:    vsll.vx v4, v8, a1
+; RV64V-NEXT:    vand.vx v24, v24, a6
+; RV64V-NEXT:    vand.vx v12, v12, t0
+; RV64V-NEXT:    vsll.vi v28, v28, 24
+; RV64V-NEXT:    vor.vv v20, v24, v20
 ; RV64V-NEXT:    vand.vx v24, v8, t0
-; RV64V-NEXT:    vsll.vi v24, v24, 8
-; RV64V-NEXT:    vor.vv v24, v0, v24
-; RV64V-NEXT:    vsll.vx v0, v8, a1
 ; RV64V-NEXT:    vand.vx v8, v8, a6
+; RV64V-NEXT:    vor.vv v12, v12, v16
+; RV64V-NEXT:    vsll.vi v16, v24, 8
 ; RV64V-NEXT:    vsll.vx v8, v8, a7
-; RV64V-NEXT:    vor.vv v8, v0, v8
-; RV64V-NEXT:    vor.vv v8, v8, v24
+; RV64V-NEXT:    vor.vv v12, v12, v20
+; RV64V-NEXT:    vor.vv v16, v28, v16
+; RV64V-NEXT:    vor.vv v8, v4, v8
 ; RV64V-NEXT:    vor.vv v8, v8, v16
+; RV64V-NEXT:    vor.vv v8, v8, v12
 ; RV64V-NEXT:    or t1, t4, t1
-; RV64V-NEXT:    vsrl.vi v16, v8, 4
+; RV64V-NEXT:    vsrl.vi v12, v8, 4
 ; RV64V-NEXT:    vand.vx v8, v8, a5
 ; RV64V-NEXT:    srli t4, t1, 4
 ; RV64V-NEXT:    and t1, t1, a5
-; RV64V-NEXT:    vand.vx v16, v16, a5
+; RV64V-NEXT:    vand.vx v12, v12, a5
 ; RV64V-NEXT:    vsll.vi v8, v8, 4
 ; RV64V-NEXT:    and t4, t4, a5
 ; RV64V-NEXT:    slli t1, t1, 4
-; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vor.vv v8, v12, v8
 ; RV64V-NEXT:    or t1, t4, t1
-; RV64V-NEXT:    vsrl.vi v16, v8, 2
+; RV64V-NEXT:    vsrl.vi v12, v8, 2
 ; RV64V-NEXT:    vand.vx v8, v8, a4
 ; RV64V-NEXT:    srli t4, t1, 2
 ; RV64V-NEXT:    and t1, t1, a4
-; RV64V-NEXT:    vand.vx v16, v16, a4
+; RV64V-NEXT:    vand.vx v12, v12, a4
 ; RV64V-NEXT:    vsll.vi v8, v8, 2
 ; RV64V-NEXT:    and t4, t4, a4
 ; RV64V-NEXT:    slli t1, t1, 2
-; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vor.vv v8, v12, v8
 ; RV64V-NEXT:    or t1, t4, t1
-; RV64V-NEXT:    vsrl.vi v16, v8, 1
+; RV64V-NEXT:    vsrl.vi v12, v8, 1
 ; RV64V-NEXT:    vand.vx v8, v8, a3
 ; RV64V-NEXT:    srli t4, t1, 1
 ; RV64V-NEXT:    and t1, t1, a3
-; RV64V-NEXT:    vand.vx v16, v16, a3
+; RV64V-NEXT:    vand.vx v12, v12, a3
 ; RV64V-NEXT:    vadd.vv v8, v8, v8
 ; RV64V-NEXT:    and t4, t4, a3
 ; RV64V-NEXT:    slli t1, t1, 1
-; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vor.vv v8, v12, v8
 ; RV64V-NEXT:    or t1, t4, t1
 ; RV64V-NEXT:    andi t4, t1, 2
-; RV64V-NEXT:    vmul.vx v16, v8, t4
+; RV64V-NEXT:    vmul.vx v12, v8, t4
 ; RV64V-NEXT:    andi t4, t1, 1
-; RV64V-NEXT:    vmul.vx v24, v8, t4
+; RV64V-NEXT:    vmul.vx v16, v8, t4
 ; RV64V-NEXT:    andi t4, t1, 4
-; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    vmul.vx v20, v8, t4
 ; RV64V-NEXT:    andi t4, t1, 8
-; RV64V-NEXT:    vxor.vv v16, v24, v16
 ; RV64V-NEXT:    vmul.vx v24, v8, t4
 ; RV64V-NEXT:    andi t4, t1, 16
-; RV64V-NEXT:    vxor.vv v16, v16, v0
-; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    vmul.vx v28, v8, t4
 ; RV64V-NEXT:    andi t4, t1, 32
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    vmul.vx v24, v8, t4
+; RV64V-NEXT:    vmul.vx v4, v8, t4
 ; RV64V-NEXT:    andi t4, t1, 64
-; RV64V-NEXT:    vxor.vv v16, v16, v0
-; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    vxor.vv v12, v16, v12
+; RV64V-NEXT:    vmul.vx v16, v8, t4
 ; RV64V-NEXT:    andi t4, t1, 128
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    vmul.vx v24, v8, t4
+; RV64V-NEXT:    vxor.vv v12, v12, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t4
 ; RV64V-NEXT:    andi t4, t1, 256
-; RV64V-NEXT:    vxor.vv v16, v16, v0
-; RV64V-NEXT:    vmul.vx v0, v8, t4
-; RV64V-NEXT:    andi t4, t1, 512
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    csrr t5, vlenb
-; RV64V-NEXT:    slli t5, t5, 3
-; RV64V-NEXT:    mv t6, t5
-; RV64V-NEXT:    slli t5, t5, 1
-; RV64V-NEXT:    add t5, t5, t6
-; RV64V-NEXT:    add t5, sp, t5
-; RV64V-NEXT:    addi t5, t5, 16
-; RV64V-NEXT:    vs8r.v v16, (t5) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vxor.vv v12, v12, v24
 ; RV64V-NEXT:    vmul.vx v24, v8, t4
+; RV64V-NEXT:    andi t4, t1, 512
+; RV64V-NEXT:    vxor.vv v12, v12, v28
+; RV64V-NEXT:    vmul.vx v28, v8, t4
 ; RV64V-NEXT:    andi t4, t1, 1024
-; RV64V-NEXT:    vxor.vv v0, v16, v0
-; RV64V-NEXT:    vxor.vv v24, v0, v24
-; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    vxor.vv v4, v12, v4
+; RV64V-NEXT:    vmul.vx v12, v8, t4
 ; RV64V-NEXT:    slli t4, a0, 11
 ; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    vxor.vv v4, v4, v16
+; RV64V-NEXT:    vmul.vx v16, v8, t4
 ; RV64V-NEXT:    lui t4, 2
 ; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vmul.vx v0, v8, t3
+; RV64V-NEXT:    vxor.vv v4, v4, v20
+; RV64V-NEXT:    addi t5, sp, 16
+; RV64V-NEXT:    vs4r.v v4, (t5) # vscale x 32-byte Folded Spill
+; RV64V-NEXT:    vmul.vx v20, v8, t3
 ; RV64V-NEXT:    lui t3, 4
 ; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    vxor.vv v24, v4, v24
+; RV64V-NEXT:    vxor.vv v28, v24, v28
+; RV64V-NEXT:    vmul.vx v24, v8, t4
 ; RV64V-NEXT:    lui t4, 8
 ; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vmul.vx v0, v8, t3
+; RV64V-NEXT:    vxor.vv v12, v28, v12
+; RV64V-NEXT:    vmul.vx v28, v8, t3
 ; RV64V-NEXT:    lui t3, 32
 ; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    vxor.vv v12, v12, v16
+; RV64V-NEXT:    vmul.vx v4, v8, t4
 ; RV64V-NEXT:    lui t4, 64
 ; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vxor.vv v12, v12, v20
 ; RV64V-NEXT:    vmul.vx v0, v8, t2
 ; RV64V-NEXT:    lui t2, 128
 ; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vmul.vx v0, v8, t3
+; RV64V-NEXT:    vxor.vv v16, v12, v24
+; RV64V-NEXT:    vmul.vx v12, v8, t3
 ; RV64V-NEXT:    lui t3, 256
 ; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    vxor.vv v20, v16, v28
+; RV64V-NEXT:    vmul.vx v16, v8, t4
 ; RV64V-NEXT:    lui t4, 512
 ; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vmul.vx v0, v8, t2
+; RV64V-NEXT:    vxor.vv v24, v20, v4
+; RV64V-NEXT:    vmul.vx v20, v8, t2
 ; RV64V-NEXT:    lui t2, 1024
 ; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vmul.vx v0, v8, t3
+; RV64V-NEXT:    vxor.vv v28, v24, v0
+; RV64V-NEXT:    vmul.vx v24, v8, t3
 ; RV64V-NEXT:    lui t3, 2048
 ; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    vxor.vv v12, v28, v12
+; RV64V-NEXT:    vmul.vx v28, v8, t4
 ; RV64V-NEXT:    lui t4, 4096
 ; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vmul.vx v0, v8, t2
+; RV64V-NEXT:    vxor.vv v16, v12, v16
+; RV64V-NEXT:    vmul.vx v12, v8, t2
 ; RV64V-NEXT:    lui t2, 8192
 ; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    vmul.vx v0, v8, t3
-; RV64V-NEXT:    vxor.vv v0, v24, v0
-; RV64V-NEXT:    vmul.vx v24, v8, t4
-; RV64V-NEXT:    vxor.vv v24, v0, v24
-; RV64V-NEXT:    vmul.vx v16, v8, t2
-; RV64V-NEXT:    vxor.vv v24, v24, v16
-; RV64V-NEXT:    lui t2, 16384
-; RV64V-NEXT:    lui t3, 32768
-; RV64V-NEXT:    lui t4, 65536
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    csrr t5, vlenb
-; RV64V-NEXT:    slli t5, t5, 3
-; RV64V-NEXT:    mv t6, t5
-; RV64V-NEXT:    slli t5, t5, 1
-; RV64V-NEXT:    add t5, t5, t6
-; RV64V-NEXT:    add t5, sp, t5
-; RV64V-NEXT:    addi t5, t5, 16
-; RV64V-NEXT:    vl8r.v v16, (t5) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vsll.vx v16, v16, a1
-; RV64V-NEXT:    vand.vx v0, v0, a6
-; RV64V-NEXT:    vsll.vx v0, v0, a7
-; RV64V-NEXT:    vor.vv v16, v16, v0
-; RV64V-NEXT:    csrr t5, vlenb
-; RV64V-NEXT:    slli t5, t5, 3
-; RV64V-NEXT:    add t5, sp, t5
-; RV64V-NEXT:    addi t5, t5, 16
-; RV64V-NEXT:    vs8r.v v16, (t5) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vmul.vx v16, v8, t2
-; RV64V-NEXT:    lui t2, 131072
-; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v16, v24, v16
-; RV64V-NEXT:    vmul.vx v24, v8, t3
-; RV64V-NEXT:    lui t3, 262144
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t3
+; RV64V-NEXT:    lui t3, 16384
 ; RV64V-NEXT:    and t4, t1, t4
 ; RV64V-NEXT:    vxor.vv v16, v16, v24
 ; RV64V-NEXT:    vmul.vx v24, v8, t4
-; RV64V-NEXT:    slli t4, a0, 32
+; RV64V-NEXT:    lui t4, 32768
 ; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    vmul.vx v24, v8, t2
-; RV64V-NEXT:    slli t2, a0, 33
+; RV64V-NEXT:    vxor.vv v16, v16, v28
+; RV64V-NEXT:    vmul.vx v28, v8, t2
+; RV64V-NEXT:    lui t2, 65536
 ; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    vmul.vx v24, v8, t3
-; RV64V-NEXT:    srliw t3, t1, 31
-; RV64V-NEXT:    slli t3, t3, 31
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    vmul.vx v24, v8, t3
-; RV64V-NEXT:    slli t3, a0, 34
+; RV64V-NEXT:    vxor.vv v12, v16, v12
+; RV64V-NEXT:    vmul.vx v16, v8, t3
+; RV64V-NEXT:    lui t3, 131072
 ; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    vmul.vx v24, v8, t4
-; RV64V-NEXT:    slli t4, a0, 35
+; RV64V-NEXT:    vxor.vv v12, v12, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t4
+; RV64V-NEXT:    lui t4, 262144
 ; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vxor.vv v24, v12, v24
+; RV64V-NEXT:    vxor.vv v4, v24, v28
 ; RV64V-NEXT:    vmul.vx v24, v8, t2
-; RV64V-NEXT:    slli t2, a0, 36
+; RV64V-NEXT:    slli t2, a0, 32
 ; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    vmul.vx v24, v8, t3
-; RV64V-NEXT:    slli t3, a0, 37
+; RV64V-NEXT:    vl4r.v v28, (t5) # vscale x 32-byte Folded Reload
+; RV64V-NEXT:    vsll.vx v28, v28, a1
+; RV64V-NEXT:    vand.vx v12, v12, a6
+; RV64V-NEXT:    vsll.vx v12, v12, a7
+; RV64V-NEXT:    vor.vv v12, v28, v12
+; RV64V-NEXT:    vmul.vx v28, v8, t3
+; RV64V-NEXT:    slli t3, a0, 33
 ; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    vmul.vx v24, v8, t4
-; RV64V-NEXT:    slli t4, a0, 38
+; RV64V-NEXT:    vxor.vv v16, v4, v16
+; RV64V-NEXT:    vmul.vx v4, v8, t4
+; RV64V-NEXT:    slli t4, a0, 34
 ; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    vmul.vx v24, v8, t2
-; RV64V-NEXT:    slli t2, a0, 39
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t2
+; RV64V-NEXT:    slli t2, a0, 35
 ; RV64V-NEXT:    and t3, t1, t3
 ; RV64V-NEXT:    vxor.vv v16, v16, v24
 ; RV64V-NEXT:    vmul.vx v24, v8, t3
-; RV64V-NEXT:    slli t3, a0, 40
+; RV64V-NEXT:    slli t3, a0, 36
 ; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    vmul.vx v24, v8, t4
-; RV64V-NEXT:    slli t4, a0, 41
+; RV64V-NEXT:    vxor.vv v28, v16, v28
+; RV64V-NEXT:    vmul.vx v16, v8, t4
+; RV64V-NEXT:    srliw t4, t1, 31
+; RV64V-NEXT:    slli t4, t4, 31
+; RV64V-NEXT:    vxor.vv v28, v28, v4
+; RV64V-NEXT:    vmul.vx v4, v8, t4
+; RV64V-NEXT:    slli t4, a0, 37
 ; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v4, v28, v4
+; RV64V-NEXT:    vmul.vx v28, v8, t2
+; RV64V-NEXT:    slli t2, a0, 38
 ; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v4, v4, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t3
+; RV64V-NEXT:    slli t3, a0, 39
 ; RV64V-NEXT:    and t4, t1, t4
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    vmul.vx v24, v8, t2
-; RV64V-NEXT:    vxor.vv v24, v16, v24
-; RV64V-NEXT:    csrr t2, vlenb
-; RV64V-NEXT:    slli t2, t2, 3
-; RV64V-NEXT:    mv t5, t2
-; RV64V-NEXT:    slli t2, t2, 1
-; RV64V-NEXT:    add t2, t2, t5
-; RV64V-NEXT:    add t2, sp, t2
-; RV64V-NEXT:    addi t2, t2, 16
-; RV64V-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    vmul.vx v16, v8, t3
-; RV64V-NEXT:    vxor.vv v16, v24, v16
+; RV64V-NEXT:    vxor.vv v4, v4, v24
 ; RV64V-NEXT:    vmul.vx v24, v8, t4
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    slli t2, a0, 42
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vmul.vx v24, v8, t2
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    slli t2, a0, 43
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vmul.vx v24, v8, t2
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    slli t2, a0, 44
+; RV64V-NEXT:    slli t4, a0, 40
 ; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vmul.vx v24, v8, t2
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    slli t2, a0, 45
+; RV64V-NEXT:    vxor.vv v4, v4, v16
+; RV64V-NEXT:    vmul.vx v16, v8, t2
+; RV64V-NEXT:    slli t2, a0, 41
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v4, v4, v28
+; RV64V-NEXT:    vmul.vx v28, v8, t3
+; RV64V-NEXT:    slli t3, a0, 42
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v4, v4, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t4
+; RV64V-NEXT:    slli t4, a0, 43
 ; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v4, v4, v24
 ; RV64V-NEXT:    vmul.vx v24, v8, t2
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    slli t2, a0, 46
+; RV64V-NEXT:    slli t2, a0, 44
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v16, v4, v16
+; RV64V-NEXT:    vmul.vx v4, v8, t3
+; RV64V-NEXT:    slli t3, a0, 45
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v16, v16, v28
+; RV64V-NEXT:    vmul.vx v28, v8, t4
+; RV64V-NEXT:    slli t4, a0, 46
 ; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vmul.vx v24, v8, t2
-; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vxor.vv v20, v16, v20
+; RV64V-NEXT:    vxor.vv v24, v20, v24
+; RV64V-NEXT:    vmul.vx v20, v8, t2
 ; RV64V-NEXT:    slli t2, a0, 47
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vmul.vx v24, v8, t2
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    slli t2, a0, 48
-; RV64V-NEXT:    slli t3, a0, 49
-; RV64V-NEXT:    and t2, t1, t2
 ; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vmul.vx v24, v8, t2
-; RV64V-NEXT:    vxor.vv v24, v16, v24
-; RV64V-NEXT:    vmul.vx v0, v8, t3
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    slli t2, a0, 50
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vmul.vx v0, v8, t2
-; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    csrr t2, vlenb
-; RV64V-NEXT:    slli t2, t2, 3
-; RV64V-NEXT:    mv t3, t2
-; RV64V-NEXT:    slli t2, t2, 1
-; RV64V-NEXT:    add t2, t2, t3
-; RV64V-NEXT:    add t2, sp, t2
-; RV64V-NEXT:    addi t2, t2, 16
-; RV64V-NEXT:    vl8r.v v0, (t2) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vsrl.vi v0, v0, 8
-; RV64V-NEXT:    vand.vx v0, v0, t0
-; RV64V-NEXT:    vsrl.vi v16, v16, 24
-; RV64V-NEXT:    vand.vx v16, v16, a2
-; RV64V-NEXT:    vor.vv v16, v0, v16
-; RV64V-NEXT:    addi t2, sp, 16
-; RV64V-NEXT:    vs8r.v v16, (t2) # vscale x 64-byte Folded Spill
-; RV64V-NEXT:    slli t2, a0, 51
+; RV64V-NEXT:    vxor.vv v4, v24, v4
+; RV64V-NEXT:    vmul.vx v24, v8, t3
+; RV64V-NEXT:    slli t3, a0, 48
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v4, v4, v28
+; RV64V-NEXT:    vmul.vx v28, v8, t4
+; RV64V-NEXT:    slli t4, a0, 49
 ; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vmul.vx v16, v8, t2
-; RV64V-NEXT:    vxor.vv v16, v24, v16
-; RV64V-NEXT:    slli t2, a0, 52
+; RV64V-NEXT:    vxor.vv v4, v4, v20
+; RV64V-NEXT:    vmul.vx v20, v8, t2
+; RV64V-NEXT:    slli t2, a0, 50
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v4, v4, v24
+; RV64V-NEXT:    vmul.vx v24, v8, t3
+; RV64V-NEXT:    slli t3, a0, 51
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v28, v4, v28
+; RV64V-NEXT:    vmul.vx v4, v8, t4
+; RV64V-NEXT:    slli t4, a0, 52
 ; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vmul.vx v24, v8, t2
-; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vxor.vv v20, v28, v20
+; RV64V-NEXT:    vmul.vx v28, v8, t2
 ; RV64V-NEXT:    slli t2, a0, 53
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v24, v20, v24
+; RV64V-NEXT:    vxor.vv v4, v24, v4
+; RV64V-NEXT:    vmul.vx v24, v8, t3
+; RV64V-NEXT:    slli t3, a0, 54
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v28, v4, v28
+; RV64V-NEXT:    vsrl.vi v4, v16, 8
+; RV64V-NEXT:    vand.vx v4, v4, t0
+; RV64V-NEXT:    vsrl.vi v20, v20, 24
+; RV64V-NEXT:    vand.vx v20, v20, a2
+; RV64V-NEXT:    vor.vv v20, v4, v20
+; RV64V-NEXT:    vmul.vx v4, v8, t4
+; RV64V-NEXT:    slli t4, a0, 55
 ; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vmul.vx v24, v8, t2
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    slli t2, a0, 54
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vmul.vx v24, v8, t2
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    slli t2, a0, 55
-; RV64V-NEXT:    and t2, t1, t2
-; RV64V-NEXT:    vmul.vx v24, v8, t2
-; RV64V-NEXT:    vxor.vv v16, v16, v24
-; RV64V-NEXT:    csrr t2, vlenb
-; RV64V-NEXT:    slli t2, t2, 4
-; RV64V-NEXT:    add t2, sp, t2
-; RV64V-NEXT:    addi t2, t2, 16
-; RV64V-NEXT:    vs8r.v v16, (t2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vxor.vv v24, v28, v24
+; RV64V-NEXT:    vmul.vx v28, v8, t2
 ; RV64V-NEXT:    slli t2, a0, 56
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v24, v24, v4
+; RV64V-NEXT:    vmul.vx v4, v8, t3
 ; RV64V-NEXT:    slli t3, a0, 57
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v24, v24, v28
+; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    slli t4, a0, 58
 ; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v24, v24, v4
+; RV64V-NEXT:    vmul.vx v28, v8, t2
+; RV64V-NEXT:    slli t2, a0, 59
 ; RV64V-NEXT:    and t3, t1, t3
-; RV64V-NEXT:    vmul.vx v24, v8, t2
-; RV64V-NEXT:    vxor.vv v24, v16, v24
-; RV64V-NEXT:    vmul.vx v0, v8, t3
 ; RV64V-NEXT:    vxor.vv v24, v24, v0
-; RV64V-NEXT:    csrr t2, vlenb
-; RV64V-NEXT:    slli t2, t2, 3
-; RV64V-NEXT:    mv t3, t2
-; RV64V-NEXT:    slli t2, t2, 1
-; RV64V-NEXT:    add t2, t2, t3
-; RV64V-NEXT:    add t2, sp, t2
-; RV64V-NEXT:    addi t2, t2, 16
-; RV64V-NEXT:    vl8r.v v16, (t2) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vand.vx v0, v16, a2
-; RV64V-NEXT:    vsll.vi v0, v0, 24
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 4
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 16
-; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vand.vx v16, v16, t0
-; RV64V-NEXT:    vsll.vi v16, v16, 8
-; RV64V-NEXT:    vor.vv v16, v0, v16
-; RV64V-NEXT:    slli a2, a0, 58
-; RV64V-NEXT:    and a2, t1, a2
-; RV64V-NEXT:    vmul.vx v0, v8, a2
-; RV64V-NEXT:    vxor.vv v0, v24, v0
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 16
-; RV64V-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vor.vv v24, v24, v16
-; RV64V-NEXT:    slli a2, a0, 59
-; RV64V-NEXT:    and a2, t1, a2
-; RV64V-NEXT:    vmul.vx v16, v8, a2
-; RV64V-NEXT:    vxor.vv v16, v0, v16
-; RV64V-NEXT:    slli a2, a0, 60
-; RV64V-NEXT:    and a2, t1, a2
-; RV64V-NEXT:    vmul.vx v0, v8, a2
-; RV64V-NEXT:    vxor.vv v16, v16, v0
+; RV64V-NEXT:    vmul.vx v4, v8, t3
+; RV64V-NEXT:    slli t3, a0, 60
+; RV64V-NEXT:    vand.vx v16, v16, a2
 ; RV64V-NEXT:    slli a2, a0, 61
-; RV64V-NEXT:    and a2, t1, a2
-; RV64V-NEXT:    vmul.vx v0, v8, a2
-; RV64V-NEXT:    vxor.vv v16, v16, v0
 ; RV64V-NEXT:    slli a0, a0, 62
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    and a2, t1, a2
 ; RV64V-NEXT:    and a0, t1, a0
-; RV64V-NEXT:    vmul.vx v0, v8, a0
-; RV64V-NEXT:    vxor.vv v16, v16, v0
-; RV64V-NEXT:    srli a0, t1, 63
-; RV64V-NEXT:    slli a0, a0, 63
-; RV64V-NEXT:    vmul.vx v8, v8, a0
+; RV64V-NEXT:    srli t1, t1, 63
+; RV64V-NEXT:    vsll.vi v16, v16, 24
+; RV64V-NEXT:    vxor.vv v28, v24, v28
+; RV64V-NEXT:    vxor.vv v28, v28, v4
+; RV64V-NEXT:    vand.vx v4, v24, t0
+; RV64V-NEXT:    vsll.vi v4, v4, 8
+; RV64V-NEXT:    vor.vv v16, v16, v4
+; RV64V-NEXT:    vmul.vx v4, v8, t4
+; RV64V-NEXT:    vxor.vv v28, v28, v4
+; RV64V-NEXT:    vmul.vx v4, v8, t2
+; RV64V-NEXT:    vor.vv v12, v12, v16
+; RV64V-NEXT:    vmul.vx v16, v8, t3
+; RV64V-NEXT:    vxor.vv v28, v28, v4
+; RV64V-NEXT:    vmul.vx v4, v8, a2
+; RV64V-NEXT:    vxor.vv v16, v28, v16
+; RV64V-NEXT:    vmul.vx v28, v8, a0
+; RV64V-NEXT:    slli t1, t1, 63
+; RV64V-NEXT:    vmul.vx v8, v8, t1
+; RV64V-NEXT:    vsrl.vx v24, v24, a7
+; RV64V-NEXT:    vand.vx v24, v24, a6
+; RV64V-NEXT:    vxor.vv v16, v16, v4
+; RV64V-NEXT:    vxor.vv v16, v16, v28
 ; RV64V-NEXT:    vxor.vv v8, v16, v8
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 16
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vsrl.vx v16, v16, a7
-; RV64V-NEXT:    vand.vx v16, v16, a6
 ; RV64V-NEXT:    vsrl.vx v8, v8, a1
-; RV64V-NEXT:    vor.vv v8, v16, v8
-; RV64V-NEXT:    addi a0, sp, 16
-; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64V-NEXT:    vor.vv v8, v16, v8
 ; RV64V-NEXT:    vor.vv v8, v24, v8
-; RV64V-NEXT:    vsrl.vi v16, v8, 4
+; RV64V-NEXT:    vor.vv v8, v20, v8
+; RV64V-NEXT:    vor.vv v8, v12, v8
+; RV64V-NEXT:    vsrl.vi v12, v8, 4
 ; RV64V-NEXT:    vand.vx v8, v8, a5
-; RV64V-NEXT:    vand.vx v16, v16, a5
+; RV64V-NEXT:    vand.vx v12, v12, a5
 ; RV64V-NEXT:    vsll.vi v8, v8, 4
-; RV64V-NEXT:    vor.vv v8, v16, v8
-; RV64V-NEXT:    vsrl.vi v16, v8, 2
+; RV64V-NEXT:    vor.vv v8, v12, v8
+; RV64V-NEXT:    vsrl.vi v12, v8, 2
 ; RV64V-NEXT:    vand.vx v8, v8, a4
-; RV64V-NEXT:    vand.vx v16, v16, a4
+; RV64V-NEXT:    vand.vx v12, v12, a4
 ; RV64V-NEXT:    vsll.vi v8, v8, 2
-; RV64V-NEXT:    vor.vv v8, v16, v8
-; RV64V-NEXT:    vsrl.vi v16, v8, 1
+; RV64V-NEXT:    vor.vv v8, v12, v8
+; RV64V-NEXT:    vsrl.vi v12, v8, 1
 ; RV64V-NEXT:    vand.vx v8, v8, a3
-; RV64V-NEXT:    vand.vx v16, v16, a3
+; RV64V-NEXT:    vand.vx v12, v12, a3
 ; RV64V-NEXT:    vadd.vv v8, v8, v8
-; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vor.vv v8, v12, v8
 ; RV64V-NEXT:    vsrl.vi v8, v8, 1
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add sp, sp, a0
 ; RV64V-NEXT:    addi sp, sp, 16
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv8i64_vx:
+; RV32ZVBC-LABEL: clmulh_nxv4i64_vx:
 ; RV32ZVBC:       # %bb.0:
 ; RV32ZVBC-NEXT:    addi sp, sp, -16
 ; RV32ZVBC-NEXT:    sw a0, 8(sp)
 ; RV32ZVBC-NEXT:    sw a1, 12(sp)
 ; RV32ZVBC-NEXT:    addi a0, sp, 8
-; RV32ZVBC-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32ZVBC-NEXT:    vlse64.v v16, (a0), zero
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v16
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32ZVBC-NEXT:    vlse64.v v12, (a0), zero
+; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v12
 ; RV32ZVBC-NEXT:    addi sp, sp, 16
 ; RV32ZVBC-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv8i64_vx:
+; RV64ZVBC-LABEL: clmulh_nxv4i64_vx:
 ; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV64ZVBC-NEXT:    vclmulh.vx v8, v8, a0
 ; RV64ZVBC-NEXT:    ret
-  %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i128 0
-  %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
-  %va.ext = zext <vscale x 8 x i64> %va to <vscale x 8 x i128>
-  %vb.ext = zext <vscale x 8 x i64> %vb to <vscale x 8 x i128>
-  %clmul = call <vscale x 8 x i128> @llvm.clmul.nxv8i128(<vscale x 8 x i128> %va.ext, <vscale x 8 x i128> %vb.ext)
-  %res.ext = lshr <vscale x 8 x i128> %clmul, splat(i128 64)
-  %res = trunc <vscale x 8 x i128> %res.ext to <vscale x 8 x i64>
-  ret <vscale x 8 x i64> %res
+  %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i128 0
+  %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %va.ext = zext <vscale x 4 x i64> %va to <vscale x 4 x i128>
+  %vb.ext = zext <vscale x 4 x i64> %vb to <vscale x 4 x i128>
+  %clmul = call <vscale x 4 x i128> @llvm.clmul.nxv4i128(<vscale x 4 x i128> %va.ext, <vscale x 4 x i128> %vb.ext)
+  %res.ext = lshr <vscale x 4 x i128> %clmul, splat(i128 64)
+  %res = trunc <vscale x 4 x i128> %res.ext to <vscale x 4 x i64>
+  ret <vscale x 4 x i64> %res
 }
 
-define <vscale x 1 x i64> @clmulh_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %mask) {
-; RV32V-LABEL: clmulh_nxv1i64_vv_mask:
+define <vscale x 8 x i64> @clmulh_nxv8i64_vv(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb) nounwind {
+; RV32V-LABEL: clmulh_nxv8i64_vv:
 ; RV32V:       # %bb.0:
 ; RV32V-NEXT:    addi sp, sp, -352
-; RV32V-NEXT:    .cfi_def_cfa_offset 352
 ; RV32V-NEXT:    sw ra, 348(sp) # 4-byte Folded Spill
 ; RV32V-NEXT:    sw s0, 344(sp) # 4-byte Folded Spill
 ; RV32V-NEXT:    sw s1, 340(sp) # 4-byte Folded Spill
@@ -50417,1011 +30404,3254 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscal
 ; RV32V-NEXT:    sw s9, 308(sp) # 4-byte Folded Spill
 ; RV32V-NEXT:    sw s10, 304(sp) # 4-byte Folded Spill
 ; RV32V-NEXT:    sw s11, 300(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    .cfi_offset ra, -4
-; RV32V-NEXT:    .cfi_offset s0, -8
-; RV32V-NEXT:    .cfi_offset s1, -12
-; RV32V-NEXT:    .cfi_offset s2, -16
-; RV32V-NEXT:    .cfi_offset s3, -20
-; RV32V-NEXT:    .cfi_offset s4, -24
-; RV32V-NEXT:    .cfi_offset s5, -28
-; RV32V-NEXT:    .cfi_offset s6, -32
-; RV32V-NEXT:    .cfi_offset s7, -36
-; RV32V-NEXT:    .cfi_offset s8, -40
-; RV32V-NEXT:    .cfi_offset s9, -44
-; RV32V-NEXT:    .cfi_offset s10, -48
-; RV32V-NEXT:    .cfi_offset s11, -52
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    sub sp, sp, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui t6, 16
+; RV32V-NEXT:    li t5, 56
+; RV32V-NEXT:    li t4, 40
+; RV32V-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV32V-NEXT:    vsrl.vx v24, v8, t5
+; RV32V-NEXT:    vsrl.vx v0, v8, t4
+; RV32V-NEXT:    addi t3, t6, -256
+; RV32V-NEXT:    vand.vx v0, v0, t3
+; RV32V-NEXT:    vor.vv v24, v0, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vsrl.vx v24, v16, t4
+; RV32V-NEXT:    vand.vx v24, v24, t3
+; RV32V-NEXT:    vsrl.vx v0, v16, t5
+; RV32V-NEXT:    vor.vv v24, v24, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, t3
+; RV32V-NEXT:    vsll.vx v24, v24, t4
+; RV32V-NEXT:    vsll.vx v0, v8, t5
+; RV32V-NEXT:    vor.vv v8, v0, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v8, v16, t3
+; RV32V-NEXT:    vsll.vx v8, v8, t4
+; RV32V-NEXT:    vsll.vx v0, v16, t5
+; RV32V-NEXT:    vor.vv v8, v0, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a5, 1044480
+; RV32V-NEXT:    lui a4, 524288
+; RV32V-NEXT:    li ra, 1
+; RV32V-NEXT:    li a6, 2
+; RV32V-NEXT:    li a7, 4
+; RV32V-NEXT:    li s0, 8
+; RV32V-NEXT:    li s11, 16
+; RV32V-NEXT:    li s10, 32
+; RV32V-NEXT:    li s9, 64
+; RV32V-NEXT:    li s8, 128
+; RV32V-NEXT:    li s7, 256
+; RV32V-NEXT:    li s6, 512
+; RV32V-NEXT:    li s5, 1024
+; RV32V-NEXT:    lui s4, 1
+; RV32V-NEXT:    lui s3, 2
+; RV32V-NEXT:    lui s2, 4
+; RV32V-NEXT:    lui s1, 8
+; RV32V-NEXT:    lui a0, 32
+; RV32V-NEXT:    lui a1, 64
+; RV32V-NEXT:    lui a2, 128
+; RV32V-NEXT:    lui a3, 256
+; RV32V-NEXT:    lui t1, 512
+; RV32V-NEXT:    lui t0, 1024
+; RV32V-NEXT:    lui t2, 2048
+; RV32V-NEXT:    sw a5, 248(sp)
+; RV32V-NEXT:    lui a5, 4096
+; RV32V-NEXT:    sw zero, 252(sp)
+; RV32V-NEXT:    sw a4, 8(sp)
+; RV32V-NEXT:    sw zero, 12(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw ra, 276(sp)
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw a6, 260(sp)
+; RV32V-NEXT:    lui a6, 8192
+; RV32V-NEXT:    sw zero, 264(sp)
+; RV32V-NEXT:    sw a7, 268(sp)
+; RV32V-NEXT:    lui a7, 16384
+; RV32V-NEXT:    sw zero, 240(sp)
+; RV32V-NEXT:    sw s0, 244(sp)
+; RV32V-NEXT:    lui s0, 32768
+; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    sw s11, 236(sp)
+; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw s10, 228(sp)
+; RV32V-NEXT:    sw zero, 216(sp)
+; RV32V-NEXT:    sw s9, 220(sp)
+; RV32V-NEXT:    sw zero, 208(sp)
+; RV32V-NEXT:    sw s8, 212(sp)
+; RV32V-NEXT:    sw zero, 200(sp)
+; RV32V-NEXT:    sw s7, 204(sp)
+; RV32V-NEXT:    sw zero, 192(sp)
+; RV32V-NEXT:    sw s6, 196(sp)
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw s5, 188(sp)
+; RV32V-NEXT:    slli ra, ra, 11
+; RV32V-NEXT:    sw zero, 176(sp)
+; RV32V-NEXT:    sw ra, 180(sp)
+; RV32V-NEXT:    sw zero, 168(sp)
+; RV32V-NEXT:    sw s4, 172(sp)
+; RV32V-NEXT:    sw zero, 160(sp)
+; RV32V-NEXT:    sw s3, 164(sp)
+; RV32V-NEXT:    sw zero, 152(sp)
+; RV32V-NEXT:    sw s2, 156(sp)
+; RV32V-NEXT:    sw zero, 144(sp)
+; RV32V-NEXT:    sw s1, 148(sp)
+; RV32V-NEXT:    sw zero, 136(sp)
+; RV32V-NEXT:    sw t6, 140(sp)
+; RV32V-NEXT:    sw zero, 128(sp)
+; RV32V-NEXT:    sw a0, 132(sp)
+; RV32V-NEXT:    sw zero, 120(sp)
+; RV32V-NEXT:    sw a1, 124(sp)
+; RV32V-NEXT:    sw zero, 112(sp)
+; RV32V-NEXT:    sw a2, 116(sp)
+; RV32V-NEXT:    sw zero, 104(sp)
+; RV32V-NEXT:    sw a3, 108(sp)
+; RV32V-NEXT:    sw zero, 96(sp)
+; RV32V-NEXT:    sw t1, 100(sp)
+; RV32V-NEXT:    sw zero, 88(sp)
+; RV32V-NEXT:    sw t0, 92(sp)
+; RV32V-NEXT:    sw zero, 80(sp)
+; RV32V-NEXT:    sw t2, 84(sp)
+; RV32V-NEXT:    sw zero, 72(sp)
+; RV32V-NEXT:    sw a5, 76(sp)
+; RV32V-NEXT:    sw zero, 64(sp)
+; RV32V-NEXT:    sw a6, 68(sp)
+; RV32V-NEXT:    lui a2, 8192
+; RV32V-NEXT:    sw zero, 56(sp)
+; RV32V-NEXT:    sw a7, 60(sp)
+; RV32V-NEXT:    lui t2, 16384
+; RV32V-NEXT:    sw zero, 48(sp)
+; RV32V-NEXT:    sw s0, 52(sp)
+; RV32V-NEXT:    lui a7, 65536
+; RV32V-NEXT:    sw zero, 40(sp)
+; RV32V-NEXT:    sw a7, 44(sp)
+; RV32V-NEXT:    lui a6, 131072
+; RV32V-NEXT:    sw zero, 32(sp)
+; RV32V-NEXT:    sw a6, 36(sp)
+; RV32V-NEXT:    lui a5, 262144
+; RV32V-NEXT:    sw zero, 24(sp)
+; RV32V-NEXT:    sw a5, 28(sp)
+; RV32V-NEXT:    sw a4, 20(sp)
+; RV32V-NEXT:    sw zero, 16(sp)
+; RV32V-NEXT:    lui a3, 4080
+; RV32V-NEXT:    addi t0, sp, 248
+; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vsrl.vi v8, v8, 24
+; RV32V-NEXT:    vand.vx v8, v8, a3
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vsrl.vi v0, v0, 8
+; RV32V-NEXT:    vand.vv v0, v0, v24
+; RV32V-NEXT:    vor.vv v8, v0, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vsrl.vi v0, v16, 24
+; RV32V-NEXT:    vand.vx v0, v0, a3
+; RV32V-NEXT:    vsrl.vi v24, v16, 8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v24, v16
+; RV32V-NEXT:    vor.vv v24, v24, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vor.vv v8, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vx v0, v16, a3
+; RV32V-NEXT:    vsll.vi v0, v0, 24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v16, v16, v8
+; RV32V-NEXT:    vsll.vi v16, v16, 8
+; RV32V-NEXT:    vor.vv v0, v0, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vor.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vx v24, v8, a3
+; RV32V-NEXT:    vsll.vi v24, v24, 24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v16, v8
+; RV32V-NEXT:    vsll.vi v8, v8, 8
+; RV32V-NEXT:    vor.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vor.vv v24, v24, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vor.vv v8, v0, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vor.vv v24, v24, v0
+; RV32V-NEXT:    lui t0, 61681
+; RV32V-NEXT:    addi t0, t0, -241
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vor.vv v16, v8, v16
+; RV32V-NEXT:    vsetvli t1, zero, e32, m8, ta, ma
+; RV32V-NEXT:    vmv.v.x v0, t0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vsetvli t0, zero, e64, m8, ta, ma
+; RV32V-NEXT:    vsrl.vi v8, v24, 4
+; RV32V-NEXT:    vand.vv v24, v24, v0
+; RV32V-NEXT:    vand.vv v8, v8, v0
+; RV32V-NEXT:    vsll.vi v24, v24, 4
+; RV32V-NEXT:    vor.vv v8, v8, v24
+; RV32V-NEXT:    vsrl.vi v24, v16, 4
+; RV32V-NEXT:    vand.vv v16, v16, v0
+; RV32V-NEXT:    vand.vv v24, v24, v0
+; RV32V-NEXT:    vsll.vi v16, v16, 4
+; RV32V-NEXT:    vor.vv v16, v24, v16
+; RV32V-NEXT:    lui t0, 209715
+; RV32V-NEXT:    addi t0, t0, 819
+; RV32V-NEXT:    vsetvli t1, zero, e32, m8, ta, ma
+; RV32V-NEXT:    vmv.v.x v0, t0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vsetvli t0, zero, e64, m8, ta, ma
+; RV32V-NEXT:    vsrl.vi v24, v8, 2
+; RV32V-NEXT:    vand.vv v8, v8, v0
+; RV32V-NEXT:    vand.vv v24, v24, v0
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v24, v8
+; RV32V-NEXT:    vsrl.vi v24, v16, 2
+; RV32V-NEXT:    vand.vv v16, v16, v0
+; RV32V-NEXT:    vand.vv v24, v24, v0
+; RV32V-NEXT:    vsll.vi v16, v16, 2
+; RV32V-NEXT:    vor.vv v24, v24, v16
+; RV32V-NEXT:    lui t0, 349525
+; RV32V-NEXT:    addi t0, t0, 1365
+; RV32V-NEXT:    vsetvli t1, zero, e32, m8, ta, ma
+; RV32V-NEXT:    vmv.v.x v0, t0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vsetvli t0, zero, e64, m8, ta, ma
+; RV32V-NEXT:    vsrl.vi v16, v8, 1
+; RV32V-NEXT:    vand.vv v8, v8, v0
+; RV32V-NEXT:    vand.vv v16, v16, v0
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v16, v16, v8
+; RV32V-NEXT:    vsrl.vi v8, v24, 1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v8, v24, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi t0, sp, 8
+; RV32V-NEXT:    vlse64.v v0, (t0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vadd.vv v24, v24, v24
+; RV32V-NEXT:    vor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi t0, sp, 272
+; RV32V-NEXT:    addi t1, sp, 256
+; RV32V-NEXT:    addi a1, sp, 264
+; RV32V-NEXT:    addi a0, sp, 240
+; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    csrr a4, vlenb
+; RV32V-NEXT:    slli a4, a4, 4
+; RV32V-NEXT:    mv t0, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add t0, t0, a4
+; RV32V-NEXT:    slli a4, a4, 1
+; RV32V-NEXT:    add t0, t0, a4
+; RV32V-NEXT:    slli a4, a4, 3
+; RV32V-NEXT:    add a4, a4, t0
+; RV32V-NEXT:    add a4, sp, a4
+; RV32V-NEXT:    addi a4, a4, 288
+; RV32V-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v0, (t1), zero
+; RV32V-NEXT:    vlse64.v v24, (a1), zero
+; RV32V-NEXT:    csrr a1, vlenb
+; RV32V-NEXT:    slli a1, a1, 5
+; RV32V-NEXT:    mv a4, a1
+; RV32V-NEXT:    slli a1, a1, 1
+; RV32V-NEXT:    add a4, a4, a1
+; RV32V-NEXT:    slli a1, a1, 3
+; RV32V-NEXT:    add a1, a1, a4
+; RV32V-NEXT:    add a1, sp, a1
+; RV32V-NEXT:    addi a1, a1, 288
+; RV32V-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 232
+; RV32V-NEXT:    addi a1, sp, 224
+; RV32V-NEXT:    addi t0, sp, 216
+; RV32V-NEXT:    addi t1, sp, 208
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a4, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a4, a4, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v0, (a1), zero
+; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v24, (t1), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 200
+; RV32V-NEXT:    addi a1, sp, 192
+; RV32V-NEXT:    addi t0, sp, 184
+; RV32V-NEXT:    addi t1, sp, 176
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a4, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a4, a4, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v0, (a1), zero
+; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v24, (t1), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 168
+; RV32V-NEXT:    addi a1, sp, 160
+; RV32V-NEXT:    addi t0, sp, 152
+; RV32V-NEXT:    addi t1, sp, 144
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a4, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v0, (a1), zero
+; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 9
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v24, (t1), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 9
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 9
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 136
+; RV32V-NEXT:    addi a1, sp, 128
+; RV32V-NEXT:    addi t0, sp, 120
+; RV32V-NEXT:    addi t1, sp, 112
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a4, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a4, a4, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a4, a4, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a4, a4, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v0, (a1), zero
+; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v24, (t1), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 104
+; RV32V-NEXT:    addi a1, sp, 96
+; RV32V-NEXT:    addi t0, sp, 88
+; RV32V-NEXT:    addi t1, sp, 80
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a4, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a4, a4, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a4, a4, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v0, (a1), zero
+; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v24, (t1), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 72
+; RV32V-NEXT:    addi a1, sp, 64
+; RV32V-NEXT:    addi t0, sp, 56
+; RV32V-NEXT:    addi t1, sp, 48
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a4, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a4, a4, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a4, a4, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v0, (a1), zero
+; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v24, (t1), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 40
+; RV32V-NEXT:    addi a1, sp, 32
+; RV32V-NEXT:    addi t0, sp, 24
+; RV32V-NEXT:    addi t1, sp, 16
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a4, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a4, a4, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v0, (a1), zero
+; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v24, (t1), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s11
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s10
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s9
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s7
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s6
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s5
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, ra
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s4
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s3
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s2
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, t6
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a0, 32
+; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a0, 64
+; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 8
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a0, 128
+; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a0, 256
+; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a0, 512
+; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a0, 1024
+; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a0, 2048
+; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a0, 4096
+; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, t2
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, a7
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, a6
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, a5
+; RV32V-NEXT:    addi a0, sp, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v24, v8, 2
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v0, v8, 1
+; RV32V-NEXT:    vand.vi v24, v8, 4
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v8, v8, 8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v24, v16, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vmul.vv v24, v16, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 8
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    sub sp, sp, a0
-; RV32V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xe0, 0x02, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 352 + 41 * vlenb
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    lui s7, 1044480
-; RV32V-NEXT:    lui a7, 524288
-; RV32V-NEXT:    li s11, 1
-; RV32V-NEXT:    li s8, 2
-; RV32V-NEXT:    li s9, 4
-; RV32V-NEXT:    li s10, 8
-; RV32V-NEXT:    li a3, 16
-; RV32V-NEXT:    li a4, 32
-; RV32V-NEXT:    li a5, 64
-; RV32V-NEXT:    li a6, 128
-; RV32V-NEXT:    li ra, 256
-; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    li a1, 1024
-; RV32V-NEXT:    lui a2, 1
-; RV32V-NEXT:    lui t0, 2
-; RV32V-NEXT:    lui t1, 4
-; RV32V-NEXT:    lui t2, 8
-; RV32V-NEXT:    lui t3, 16
-; RV32V-NEXT:    lui t4, 32
-; RV32V-NEXT:    lui t5, 64
-; RV32V-NEXT:    lui t6, 128
-; RV32V-NEXT:    lui s0, 256
-; RV32V-NEXT:    lui s1, 512
-; RV32V-NEXT:    lui s2, 1024
-; RV32V-NEXT:    lui s3, 2048
-; RV32V-NEXT:    lui s4, 4096
-; RV32V-NEXT:    lui s5, 8192
-; RV32V-NEXT:    lui s6, 16384
-; RV32V-NEXT:    sw s7, 272(sp)
-; RV32V-NEXT:    lui s7, 32768
-; RV32V-NEXT:    sw zero, 276(sp)
-; RV32V-NEXT:    sw a7, 264(sp)
-; RV32V-NEXT:    sw zero, 268(sp)
-; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw s11, 260(sp)
-; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw s8, 252(sp)
-; RV32V-NEXT:    lui s8, 65536
-; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    sw s9, 244(sp)
-; RV32V-NEXT:    lui s9, 131072
-; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw s10, 236(sp)
-; RV32V-NEXT:    lui s10, 262144
-; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw a3, 228(sp)
-; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw a4, 220(sp)
-; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw a5, 212(sp)
-; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw a6, 204(sp)
-; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw ra, 196(sp)
-; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw a0, 188(sp)
-; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw a1, 180(sp)
-; RV32V-NEXT:    slli s11, s11, 11
-; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw s11, 172(sp)
-; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw a2, 164(sp)
-; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw t0, 156(sp)
-; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw t1, 148(sp)
-; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw t2, 140(sp)
-; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw t3, 132(sp)
-; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw t4, 124(sp)
-; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw t5, 116(sp)
-; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw t6, 108(sp)
-; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw s0, 100(sp)
-; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw s1, 92(sp)
-; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw s2, 84(sp)
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw s3, 76(sp)
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw s4, 68(sp)
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw s5, 60(sp)
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw s6, 52(sp)
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw s7, 44(sp)
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw s8, 36(sp)
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    sw s9, 28(sp)
-; RV32V-NEXT:    sw zero, 16(sp)
-; RV32V-NEXT:    sw s10, 20(sp)
-; RV32V-NEXT:    sw zero, 8(sp)
-; RV32V-NEXT:    sw a7, 12(sp)
-; RV32V-NEXT:    lui a0, 61681
-; RV32V-NEXT:    addi a0, a0, -241
-; RV32V-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32V-NEXT:    vmv.v.x v2, a0
-; RV32V-NEXT:    lui a0, 209715
-; RV32V-NEXT:    addi a0, a0, 819
-; RV32V-NEXT:    vmv.v.x v1, a0
-; RV32V-NEXT:    lui a0, 349525
-; RV32V-NEXT:    addi a0, a0, 1365
-; RV32V-NEXT:    vmv.v.x v0, a0
-; RV32V-NEXT:    addi a0, sp, 272
-; RV32V-NEXT:    vsetvli a1, zero, e64, m1, ta, mu
-; RV32V-NEXT:    vlse64.v v4, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 264
-; RV32V-NEXT:    vlse64.v v10, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 256
-; RV32V-NEXT:    vlse64.v v11, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 248
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 240
-; RV32V-NEXT:    vlse64.v v13, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 232
-; RV32V-NEXT:    vlse64.v v14, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 224
-; RV32V-NEXT:    vlse64.v v18, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 216
-; RV32V-NEXT:    vlse64.v v19, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 208
-; RV32V-NEXT:    vlse64.v v20, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 200
-; RV32V-NEXT:    vlse64.v v21, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 192
-; RV32V-NEXT:    vlse64.v v22, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 184
-; RV32V-NEXT:    vlse64.v v23, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 176
-; RV32V-NEXT:    vlse64.v v24, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 168
-; RV32V-NEXT:    vlse64.v v25, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 160
-; RV32V-NEXT:    vlse64.v v26, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 152
-; RV32V-NEXT:    vlse64.v v27, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 144
-; RV32V-NEXT:    vlse64.v v28, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 136
-; RV32V-NEXT:    vlse64.v v29, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 128
-; RV32V-NEXT:    vlse64.v v30, (a0), zero
-; RV32V-NEXT:    li a6, 56
-; RV32V-NEXT:    vmv1r.v v31, v8
-; RV32V-NEXT:    vsrl.vi v8, v8, 24
-; RV32V-NEXT:    vsrl.vi v15, v31, 8
-; RV32V-NEXT:    vsrl.vx v16, v31, a6
-; RV32V-NEXT:    li ra, 40
-; RV32V-NEXT:    vsrl.vx v17, v31, ra
-; RV32V-NEXT:    vsll.vx v7, v31, a6
-; RV32V-NEXT:    vsrl.vx v6, v9, a6
-; RV32V-NEXT:    vsrl.vx v5, v9, ra
-; RV32V-NEXT:    addi a4, t3, -256
-; RV32V-NEXT:    vand.vx v17, v17, a4
-; RV32V-NEXT:    vor.vv v16, v17, v16
-; RV32V-NEXT:    vsll.vx v17, v9, a6
-; RV32V-NEXT:    vand.vx v5, v5, a4
-; RV32V-NEXT:    vor.vv v6, v5, v6
-; RV32V-NEXT:    vand.vx v5, v31, a4
-; RV32V-NEXT:    vsll.vx v5, v5, ra
-; RV32V-NEXT:    vor.vv v7, v7, v5
-; RV32V-NEXT:    vand.vx v5, v9, a4
-; RV32V-NEXT:    vsll.vx v5, v5, ra
-; RV32V-NEXT:    vor.vv v5, v17, v5
-; RV32V-NEXT:    vsrl.vi v17, v9, 24
-; RV32V-NEXT:    lui a5, 4080
-; RV32V-NEXT:    vand.vx v8, v8, a5
-; RV32V-NEXT:    vand.vv v15, v15, v4
-; RV32V-NEXT:    vor.vv v8, v15, v8
-; RV32V-NEXT:    vsrl.vi v15, v9, 8
-; RV32V-NEXT:    vand.vx v17, v17, a5
-; RV32V-NEXT:    vand.vv v15, v15, v4
-; RV32V-NEXT:    vor.vv v17, v15, v17
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vx v15, v31, a5
-; RV32V-NEXT:    vsll.vi v15, v15, 24
-; RV32V-NEXT:    vor.vv v8, v8, v16
-; RV32V-NEXT:    vand.vv v16, v31, v4
-; RV32V-NEXT:    vmv.v.v v31, v4
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vsll.vi v16, v16, 8
-; RV32V-NEXT:    vor.vv v4, v15, v16
-; RV32V-NEXT:    addi a3, sp, 120
-; RV32V-NEXT:    vlse64.v v15, (a3), zero
-; RV32V-NEXT:    vor.vv v6, v17, v6
-; RV32V-NEXT:    vand.vx v16, v9, a5
-; RV32V-NEXT:    vsll.vi v16, v16, 24
-; RV32V-NEXT:    vand.vv v9, v9, v31
-; RV32V-NEXT:    vsll.vi v9, v9, 8
-; RV32V-NEXT:    vor.vv v9, v16, v9
-; RV32V-NEXT:    addi a3, sp, 112
-; RV32V-NEXT:    vlse64.v v16, (a3), zero
-; RV32V-NEXT:    vor.vv v7, v7, v4
-; RV32V-NEXT:    addi a3, sp, 104
-; RV32V-NEXT:    vlse64.v v17, (a3), zero
-; RV32V-NEXT:    vor.vv v9, v5, v9
-; RV32V-NEXT:    addi a3, sp, 96
-; RV32V-NEXT:    vlse64.v v4, (a3), zero
-; RV32V-NEXT:    vor.vv v8, v7, v8
-; RV32V-NEXT:    addi a3, sp, 88
-; RV32V-NEXT:    vlse64.v v3, (a3), zero
-; RV32V-NEXT:    vor.vv v9, v9, v6
-; RV32V-NEXT:    vsrl.vi v7, v8, 4
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v8, v2
-; RV32V-NEXT:    vand.vv v7, v7, v2
-; RV32V-NEXT:    vsll.vi v8, v8, 4
-; RV32V-NEXT:    vor.vv v8, v7, v8
-; RV32V-NEXT:    vsrl.vi v7, v9, 4
-; RV32V-NEXT:    vand.vv v9, v9, v2
-; RV32V-NEXT:    vand.vv v7, v7, v2
-; RV32V-NEXT:    vsll.vi v9, v9, 4
-; RV32V-NEXT:    vor.vv v9, v7, v9
-; RV32V-NEXT:    vsrl.vi v7, v8, 2
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v8, v1
-; RV32V-NEXT:    vand.vv v7, v7, v1
-; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v7, v8
-; RV32V-NEXT:    vsrl.vi v7, v9, 2
-; RV32V-NEXT:    vand.vv v9, v9, v1
-; RV32V-NEXT:    vand.vv v7, v7, v1
-; RV32V-NEXT:    vsll.vi v9, v9, 2
-; RV32V-NEXT:    vor.vv v7, v7, v9
-; RV32V-NEXT:    vsrl.vi v9, v8, 1
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v8, v0
-; RV32V-NEXT:    vand.vv v9, v9, v0
-; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v9, v9, v8
-; RV32V-NEXT:    vsrl.vi v8, v7, 1
-; RV32V-NEXT:    vand.vv v7, v7, v0
-; RV32V-NEXT:    vand.vv v8, v8, v0
-; RV32V-NEXT:    vadd.vv v7, v7, v7
-; RV32V-NEXT:    vor.vv v8, v8, v7
-; RV32V-NEXT:    addi a3, sp, 80
-; RV32V-NEXT:    vlse64.v v2, (a3), zero
-; RV32V-NEXT:    vand.vv v10, v8, v10
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v11
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v12
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v13
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    sub a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v14
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v18
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 8
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v19
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v20
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v21
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v22
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v23
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v24
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 9
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 9
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v25
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v26
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v27
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v28
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v29
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v30
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v15
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v16
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    sub a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v17
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi a3, sp, 72
-; RV32V-NEXT:    vlse64.v v10, (a3), zero
-; RV32V-NEXT:    vand.vv v11, v8, v4
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v11, v8, v3
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v11, v8, v2
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v10
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi a3, sp, 64
-; RV32V-NEXT:    addi a2, sp, 56
-; RV32V-NEXT:    addi a1, sp, 48
-; RV32V-NEXT:    addi a0, sp, 40
-; RV32V-NEXT:    vlse64.v v10, (a3), zero
-; RV32V-NEXT:    vlse64.v v11, (a2), zero
-; RV32V-NEXT:    vlse64.v v12, (a1), zero
-; RV32V-NEXT:    vlse64.v v13, (a0), zero
-; RV32V-NEXT:    vand.vv v10, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v11
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v13
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 32
-; RV32V-NEXT:    addi a1, sp, 24
-; RV32V-NEXT:    addi a2, sp, 16
-; RV32V-NEXT:    addi a3, sp, 8
-; RV32V-NEXT:    vlse64.v v10, (a0), zero
-; RV32V-NEXT:    vlse64.v v11, (a1), zero
-; RV32V-NEXT:    vlse64.v v12, (a2), zero
-; RV32V-NEXT:    vlse64.v v13, (a3), zero
-; RV32V-NEXT:    vand.vv v10, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 2
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v11
-; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 1
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v13
-; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vand.vx v2, v8, a0
-; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vand.vx v1, v8, a0
-; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vand.vx v0, v8, a0
-; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vand.vx v13, v8, a0
-; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vand.vx v14, v8, a0
-; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vand.vx v15, v8, a0
-; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vand.vx v16, v8, a0
-; RV32V-NEXT:    vand.vx v17, v8, s11
-; RV32V-NEXT:    lui a0, 1
-; RV32V-NEXT:    vand.vx v18, v8, a0
-; RV32V-NEXT:    vand.vx v19, v8, t0
-; RV32V-NEXT:    vand.vx v20, v8, t1
-; RV32V-NEXT:    vand.vx v21, v8, t2
-; RV32V-NEXT:    vand.vx v22, v8, t3
-; RV32V-NEXT:    vand.vx v23, v8, t4
-; RV32V-NEXT:    vand.vx v24, v8, t5
-; RV32V-NEXT:    vand.vx v25, v8, t6
-; RV32V-NEXT:    vand.vx v26, v8, s0
-; RV32V-NEXT:    vand.vx v27, v8, s1
-; RV32V-NEXT:    vand.vx v28, v8, s2
-; RV32V-NEXT:    vand.vx v29, v8, s3
-; RV32V-NEXT:    vand.vx v30, v8, s4
-; RV32V-NEXT:    vand.vx v31, v8, s5
-; RV32V-NEXT:    vand.vx v7, v8, s6
-; RV32V-NEXT:    vand.vx v6, v8, s7
-; RV32V-NEXT:    vand.vx v5, v8, s8
-; RV32V-NEXT:    vand.vx v4, v8, s9
-; RV32V-NEXT:    vand.vx v3, v8, s10
-; RV32V-NEXT:    vand.vi v10, v8, 2
-; RV32V-NEXT:    vand.vi v11, v8, 1
-; RV32V-NEXT:    vand.vi v12, v8, 4
-; RV32V-NEXT:    vand.vi v8, v8, 8
-; RV32V-NEXT:    vmul.vv v10, v9, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v11, v9, v11
-; RV32V-NEXT:    vmul.vv v12, v9, v12
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v2, v9, v2
-; RV32V-NEXT:    vmul.vv v1, v9, v1
-; RV32V-NEXT:    vmul.vv v0, v9, v0
-; RV32V-NEXT:    vmul.vv v13, v9, v13
-; RV32V-NEXT:    vmul.vv v14, v9, v14
-; RV32V-NEXT:    vmul.vv v15, v9, v15
-; RV32V-NEXT:    vmul.vv v16, v9, v16
-; RV32V-NEXT:    vmul.vv v17, v9, v17
-; RV32V-NEXT:    vmul.vv v18, v9, v18
-; RV32V-NEXT:    vmul.vv v19, v9, v19
-; RV32V-NEXT:    vmul.vv v20, v9, v20
-; RV32V-NEXT:    vmul.vv v21, v9, v21
-; RV32V-NEXT:    vmul.vv v22, v9, v22
-; RV32V-NEXT:    vmul.vv v23, v9, v23
-; RV32V-NEXT:    vmul.vv v24, v9, v24
-; RV32V-NEXT:    vmul.vv v25, v9, v25
-; RV32V-NEXT:    vmul.vv v26, v9, v26
-; RV32V-NEXT:    vmul.vv v27, v9, v27
-; RV32V-NEXT:    vmul.vv v28, v9, v28
-; RV32V-NEXT:    vmul.vv v29, v9, v29
-; RV32V-NEXT:    vmul.vv v30, v9, v30
-; RV32V-NEXT:    vmul.vv v31, v9, v31
-; RV32V-NEXT:    vmul.vv v7, v9, v7
-; RV32V-NEXT:    vmul.vv v6, v9, v6
-; RV32V-NEXT:    vmul.vv v5, v9, v5
-; RV32V-NEXT:    vmul.vv v4, v9, v4
-; RV32V-NEXT:    vmul.vv v3, v9, v3
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v10, v9, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    sub a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 7
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -51429,48 +33659,47 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscal
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 7
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    sub a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -51478,15 +33707,10 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscal
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
@@ -51494,254 +33718,284 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscal
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vsll.vx v8, v8, t5
+; RV32V-NEXT:    vand.vx v16, v16, t3
+; RV32V-NEXT:    vsll.vx v16, v16, t4
+; RV32V-NEXT:    vor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    sub a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 2
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 2
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 1
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v9, v9, v8
-; RV32V-NEXT:    vxor.vi v11, v11, 0
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v11, v11, v8
-; RV32V-NEXT:    vxor.vv v11, v11, v12
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v11, v8
-; RV32V-NEXT:    vxor.vv v8, v8, v2
-; RV32V-NEXT:    vxor.vv v8, v8, v1
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v13
-; RV32V-NEXT:    vxor.vv v8, v8, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v15
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
 ; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v17
-; RV32V-NEXT:    vxor.vv v8, v8, v18
-; RV32V-NEXT:    vxor.vv v8, v8, v19
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v21
-; RV32V-NEXT:    vxor.vv v8, v8, v22
-; RV32V-NEXT:    vxor.vv v8, v8, v23
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v25
-; RV32V-NEXT:    vxor.vv v8, v8, v26
-; RV32V-NEXT:    vxor.vv v8, v8, v27
-; RV32V-NEXT:    vxor.vv v8, v8, v28
-; RV32V-NEXT:    vxor.vv v8, v8, v29
-; RV32V-NEXT:    vxor.vv v8, v8, v30
-; RV32V-NEXT:    vxor.vv v8, v8, v31
-; RV32V-NEXT:    vxor.vv v8, v8, v7
-; RV32V-NEXT:    vxor.vv v8, v8, v6
-; RV32V-NEXT:    vxor.vv v8, v8, v5
-; RV32V-NEXT:    vxor.vv v8, v8, v4
-; RV32V-NEXT:    vxor.vv v8, v8, v3
-; RV32V-NEXT:    vxor.vv v8, v8, v10
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    sub a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 8
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -51749,246 +34003,294 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscal
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 9
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 288
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v0, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v0, v0, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v0, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vsrl.vi v0, v24, 8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v0, v0, v16
+; RV32V-NEXT:    vsrl.vi v8, v8, 24
+; RV32V-NEXT:    vand.vx v8, v8, a3
+; RV32V-NEXT:    vor.vv v8, v0, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v0
+; RV32V-NEXT:    vand.vx v24, v24, a3
+; RV32V-NEXT:    vsll.vi v24, v24, 24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v0, v8, v0
+; RV32V-NEXT:    vsll.vi v0, v0, 8
+; RV32V-NEXT:    vor.vv v24, v24, v0
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v0, v16, v0
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    sub a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v0, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 2
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v10
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vsrl.vx v9, v8, a6
-; RV32V-NEXT:    vsll.vx v10, v8, a6
-; RV32V-NEXT:    vsrl.vx v11, v8, ra
-; RV32V-NEXT:    vand.vx v12, v8, a4
-; RV32V-NEXT:    vand.vx v11, v11, a4
-; RV32V-NEXT:    vsrl.vi v13, v8, 24
-; RV32V-NEXT:    vand.vx v14, v8, a5
-; RV32V-NEXT:    vand.vx v13, v13, a5
-; RV32V-NEXT:    vsll.vx v12, v12, ra
-; RV32V-NEXT:    vsrl.vi v15, v8, 8
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -51996,87 +34298,87 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscal
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v16
-; RV32V-NEXT:    vand.vv v15, v15, v16
-; RV32V-NEXT:    vor.vv v9, v11, v9
-; RV32V-NEXT:    vor.vv v11, v15, v13
-; RV32V-NEXT:    vsll.vi v8, v8, 8
-; RV32V-NEXT:    vsll.vi v13, v14, 24
-; RV32V-NEXT:    vor.vv v8, v13, v8
-; RV32V-NEXT:    vor.vv v10, v10, v12
-; RV32V-NEXT:    vor.vv v9, v11, v9
-; RV32V-NEXT:    vor.vv v8, v10, v8
-; RV32V-NEXT:    vor.vv v8, v8, v9
-; RV32V-NEXT:    vsrl.vi v9, v8, 4
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v10
-; RV32V-NEXT:    vand.vv v9, v9, v10
-; RV32V-NEXT:    vsll.vi v8, v8, 4
-; RV32V-NEXT:    vor.vv v8, v9, v8
-; RV32V-NEXT:    vsrl.vi v9, v8, 2
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vsrl.vx v8, v8, t4
+; RV32V-NEXT:    vand.vx v8, v8, t3
+; RV32V-NEXT:    vsrl.vx v24, v24, t5
+; RV32V-NEXT:    vor.vv v8, v8, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v10
-; RV32V-NEXT:    vand.vv v9, v9, v10
-; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v9, v8
-; RV32V-NEXT:    vsrl.vi v9, v8, 1
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vor.vv v8, v24, v8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 4
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v10
-; RV32V-NEXT:    vand.vv v9, v9, v10
-; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v9, v9, v8
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v24
+; RV32V-NEXT:    vand.vv v16, v16, v24
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 2
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 7
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v24
+; RV32V-NEXT:    vand.vv v16, v16, v24
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 1
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 288
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vsrl.vi v8, v9, 1, v0.t
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v24
+; RV32V-NEXT:    vand.vv v16, v16, v24
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v8, v8, 1
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add sp, sp, a0
-; RV32V-NEXT:    .cfi_def_cfa sp, 352
 ; RV32V-NEXT:    lw ra, 348(sp) # 4-byte Folded Reload
 ; RV32V-NEXT:    lw s0, 344(sp) # 4-byte Folded Reload
 ; RV32V-NEXT:    lw s1, 340(sp) # 4-byte Folded Reload
@@ -52090,54 +34392,1274 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscal
 ; RV32V-NEXT:    lw s9, 308(sp) # 4-byte Folded Reload
 ; RV32V-NEXT:    lw s10, 304(sp) # 4-byte Folded Reload
 ; RV32V-NEXT:    lw s11, 300(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    .cfi_restore ra
-; RV32V-NEXT:    .cfi_restore s0
-; RV32V-NEXT:    .cfi_restore s1
-; RV32V-NEXT:    .cfi_restore s2
-; RV32V-NEXT:    .cfi_restore s3
-; RV32V-NEXT:    .cfi_restore s4
-; RV32V-NEXT:    .cfi_restore s5
-; RV32V-NEXT:    .cfi_restore s6
-; RV32V-NEXT:    .cfi_restore s7
-; RV32V-NEXT:    .cfi_restore s8
-; RV32V-NEXT:    .cfi_restore s9
-; RV32V-NEXT:    .cfi_restore s10
-; RV32V-NEXT:    .cfi_restore s11
 ; RV32V-NEXT:    addi sp, sp, 352
-; RV32V-NEXT:    .cfi_def_cfa_offset 0
 ; RV32V-NEXT:    ret
 ;
-; RV64V-LABEL: clmulh_nxv1i64_vv_mask:
+; RV64V-LABEL: clmulh_nxv8i64_vv:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    addi sp, sp, -224
-; RV64V-NEXT:    .cfi_def_cfa_offset 224
-; RV64V-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    .cfi_offset ra, -8
-; RV64V-NEXT:    .cfi_offset s0, -16
-; RV64V-NEXT:    .cfi_offset s1, -24
-; RV64V-NEXT:    .cfi_offset s2, -32
-; RV64V-NEXT:    .cfi_offset s3, -40
-; RV64V-NEXT:    .cfi_offset s4, -48
-; RV64V-NEXT:    .cfi_offset s5, -56
-; RV64V-NEXT:    .cfi_offset s6, -64
-; RV64V-NEXT:    .cfi_offset s7, -72
-; RV64V-NEXT:    .cfi_offset s8, -80
-; RV64V-NEXT:    .cfi_offset s9, -88
-; RV64V-NEXT:    .cfi_offset s10, -96
-; RV64V-NEXT:    .cfi_offset s11, -104
+; RV64V-NEXT:    addi sp, sp, -416
+; RV64V-NEXT:    sd ra, 408(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s0, 400(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s1, 392(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s2, 384(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s3, 376(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s4, 368(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s5, 360(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s6, 352(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s7, 344(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s8, 336(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s9, 328(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s10, 320(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s11, 312(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    sub sp, sp, a0
+; RV64V-NEXT:    li a1, 56
+; RV64V-NEXT:    li a2, 40
+; RV64V-NEXT:    lui a3, 16
+; RV64V-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64V-NEXT:    vsrl.vx v24, v8, a1
+; RV64V-NEXT:    vsrl.vx v0, v8, a2
+; RV64V-NEXT:    addi a2, a3, -256
+; RV64V-NEXT:    vand.vx v0, v0, a2
+; RV64V-NEXT:    vor.vv v24, v0, v24
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vsrl.vi v24, v8, 24
+; RV64V-NEXT:    lui a1, 4080
+; RV64V-NEXT:    li s4, 255
+; RV64V-NEXT:    vand.vx v24, v24, a1
+; RV64V-NEXT:    slli s4, s4, 24
+; RV64V-NEXT:    vsrl.vi v0, v8, 8
+; RV64V-NEXT:    vand.vx v0, v0, s4
+; RV64V-NEXT:    vor.vv v24, v0, v24
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    li a1, 40
+; RV64V-NEXT:    vsrl.vx v24, v16, a1
+; RV64V-NEXT:    sd a2, 280(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v24, v24, a2
+; RV64V-NEXT:    li a0, 56
+; RV64V-NEXT:    vsrl.vx v0, v16, a0
+; RV64V-NEXT:    vor.vv v24, v24, v0
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vsrl.vi v24, v16, 24
+; RV64V-NEXT:    lui a3, 4080
+; RV64V-NEXT:    vand.vx v24, v24, a3
+; RV64V-NEXT:    vsrl.vi v0, v16, 8
+; RV64V-NEXT:    vand.vx v0, v0, s4
+; RV64V-NEXT:    vor.vv v24, v0, v24
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 304
+; RV64V-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 304
+; RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 304
+; RV64V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vor.vv v24, v0, v24
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 304
+; RV64V-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v0, v8, a3
+; RV64V-NEXT:    vsll.vi v0, v0, 24
+; RV64V-NEXT:    vand.vx v24, v8, s4
+; RV64V-NEXT:    sd s4, 288(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vsll.vi v24, v24, 8
+; RV64V-NEXT:    vor.vv v24, v0, v24
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 7
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 304
+; RV64V-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vsll.vx v24, v8, a0
+; RV64V-NEXT:    li a0, 56
+; RV64V-NEXT:    vand.vx v8, v8, a2
+; RV64V-NEXT:    vsll.vx v8, v8, a1
+; RV64V-NEXT:    vor.vv v8, v24, v8
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 304
+; RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 1
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 304
+; RV64V-NEXT:    vl8r.v v0, (a4) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vor.vv v24, v0, v24
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 4
+; RV64V-NEXT:    mv a5, a4
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a5, a5, a4
+; RV64V-NEXT:    slli a4, a4, 2
+; RV64V-NEXT:    add a4, a4, a5
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 304
+; RV64V-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v24, v16, a3
+; RV64V-NEXT:    vsll.vi v24, v24, 24
+; RV64V-NEXT:    vand.vx v0, v16, s4
+; RV64V-NEXT:    vsll.vi v0, v0, 8
+; RV64V-NEXT:    vor.vv v24, v24, v0
+; RV64V-NEXT:    vsll.vx v0, v16, a0
+; RV64V-NEXT:    vand.vx v16, v16, a2
+; RV64V-NEXT:    vsll.vx v16, v16, a1
+; RV64V-NEXT:    vor.vv v16, v0, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 7
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vor.vv v8, v8, v0
+; RV64V-NEXT:    vor.vv v16, v16, v24
+; RV64V-NEXT:    lui a0, 61681
+; RV64V-NEXT:    lui a1, 209715
+; RV64V-NEXT:    lui a2, 349525
+; RV64V-NEXT:    li a4, 16
+; RV64V-NEXT:    li a3, 32
+; RV64V-NEXT:    li t2, 1
+; RV64V-NEXT:    addi a7, a0, -241
+; RV64V-NEXT:    addi t0, a1, 819
+; RV64V-NEXT:    addi t1, a2, 1365
+; RV64V-NEXT:    slli a0, t2, 11
+; RV64V-NEXT:    sd a0, 248(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 31
+; RV64V-NEXT:    sd a0, 240(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 32
+; RV64V-NEXT:    sd a0, 232(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 33
+; RV64V-NEXT:    sd a0, 224(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 34
+; RV64V-NEXT:    sd a0, 216(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 35
+; RV64V-NEXT:    sd a0, 208(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 36
+; RV64V-NEXT:    sd a0, 200(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 37
+; RV64V-NEXT:    sd a0, 192(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 38
+; RV64V-NEXT:    sd a0, 184(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 39
+; RV64V-NEXT:    sd a0, 176(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 40
+; RV64V-NEXT:    sd a0, 168(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 41
+; RV64V-NEXT:    sd a0, 160(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 42
+; RV64V-NEXT:    sd a0, 152(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 43
+; RV64V-NEXT:    sd a0, 144(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 44
+; RV64V-NEXT:    sd a0, 136(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, a7, 32
+; RV64V-NEXT:    add a7, a7, a0
+; RV64V-NEXT:    slli a0, t0, 32
+; RV64V-NEXT:    add t0, t0, a0
+; RV64V-NEXT:    slli a0, t1, 32
+; RV64V-NEXT:    add a0, t1, a0
+; RV64V-NEXT:    sd a0, 272(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a1, t2, 45
+; RV64V-NEXT:    sd a1, 128(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 4
+; RV64V-NEXT:    mv a2, a1
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a1, a1, a2
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 304
+; RV64V-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vor.vv v16, v16, v24
+; RV64V-NEXT:    vsrl.vi v24, v16, 4
+; RV64V-NEXT:    vand.vx v16, v16, a7
+; RV64V-NEXT:    vand.vx v24, v24, a7
+; RV64V-NEXT:    sd a7, 256(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vsll.vi v16, v16, 4
+; RV64V-NEXT:    vor.vv v16, v24, v16
+; RV64V-NEXT:    vsrl.vi v24, v16, 2
+; RV64V-NEXT:    vand.vx v16, v16, t0
+; RV64V-NEXT:    vand.vx v24, v24, t0
+; RV64V-NEXT:    sd t0, 264(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vsll.vi v16, v16, 2
+; RV64V-NEXT:    vor.vv v16, v24, v16
+; RV64V-NEXT:    vsrl.vi v24, v16, 1
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vand.vx v24, v24, a0
+; RV64V-NEXT:    vadd.vv v16, v16, v16
+; RV64V-NEXT:    vor.vv v0, v24, v16
+; RV64V-NEXT:    vand.vx v16, v0, a4
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 4
+; RV64V-NEXT:    mv a2, a1
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a1, a1, a2
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 304
+; RV64V-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    slli a1, t2, 46
+; RV64V-NEXT:    sd a1, 120(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    mv a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 3
+; RV64V-NEXT:    add a2, a2, a1
+; RV64V-NEXT:    slli a1, a1, 2
+; RV64V-NEXT:    add a1, a1, a2
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 304
+; RV64V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vor.vv v8, v8, v16
+; RV64V-NEXT:    vsrl.vi v16, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, a7
+; RV64V-NEXT:    vand.vx v16, v16, a7
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, t0
+; RV64V-NEXT:    vand.vx v16, v16, t0
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vand.vx v16, v0, a3
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 47
+; RV64V-NEXT:    sd a0, 112(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 48
+; RV64V-NEXT:    sd a0, 104(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 49
+; RV64V-NEXT:    sd a0, 96(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 50
+; RV64V-NEXT:    sd a0, 88(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 51
+; RV64V-NEXT:    sd a0, 80(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 52
+; RV64V-NEXT:    sd a0, 72(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 53
+; RV64V-NEXT:    sd a0, 64(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 54
+; RV64V-NEXT:    sd a0, 56(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 55
+; RV64V-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 56
+; RV64V-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli a0, t2, 57
+; RV64V-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli ra, t2, 58
+; RV64V-NEXT:    slli s10, t2, 59
+; RV64V-NEXT:    slli s8, t2, 60
+; RV64V-NEXT:    slli a0, t2, 61
+; RV64V-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    slli s11, t2, 62
+; RV64V-NEXT:    li a1, -1
+; RV64V-NEXT:    slli s9, a1, 63
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    li a1, 128
+; RV64V-NEXT:    li a2, 256
+; RV64V-NEXT:    li a3, 512
+; RV64V-NEXT:    li a4, 1024
+; RV64V-NEXT:    lui a5, 1
+; RV64V-NEXT:    lui a6, 2
+; RV64V-NEXT:    lui a7, 4
+; RV64V-NEXT:    lui t0, 8
+; RV64V-NEXT:    lui t1, 32
+; RV64V-NEXT:    lui t2, 64
+; RV64V-NEXT:    lui t3, 128
+; RV64V-NEXT:    lui t4, 256
+; RV64V-NEXT:    lui t5, 512
+; RV64V-NEXT:    lui t6, 1024
+; RV64V-NEXT:    lui s0, 2048
+; RV64V-NEXT:    lui s1, 4096
+; RV64V-NEXT:    lui s2, 8192
+; RV64V-NEXT:    lui s3, 16384
+; RV64V-NEXT:    lui s4, 32768
+; RV64V-NEXT:    lui s5, 65536
+; RV64V-NEXT:    lui s6, 131072
+; RV64V-NEXT:    lui s7, 262144
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    sd s8, 8(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    mv s8, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add s8, s8, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, s8
+; RV64V-NEXT:    ld s8, 8(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 7
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a2
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a3
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a4
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 248(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a5
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a6
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, a7
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t2
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t3
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t4
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t5
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, t6
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 9
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s2
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s3
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s4
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s5
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s6
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s7
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 240(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 232(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 224(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 216(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 208(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 200(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 192(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 184(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 176(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 7
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 168(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 160(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 152(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 144(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 136(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 128(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 120(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 112(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 104(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 96(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 88(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 80(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 72(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 64(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 56(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 8
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, ra
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s10
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v16, v0, 2
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v24, v0, 1
+; RV64V-NEXT:    vand.vi v16, v0, 4
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vi v16, v0, 8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v0, a0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s11
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vand.vx v16, v0, s9
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vmul.vv v16, v8, v24
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v24, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v0, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 7
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
@@ -52145,877 +35667,735 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscal
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 7
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    sub sp, sp, a0
-; RV64V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xe0, 0x01, 0x22, 0x11, 0x2f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 224 + 47 * vlenb
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
-; RV64V-NEXT:    vmv1r.v v18, v8
-; RV64V-NEXT:    li a3, 56
-; RV64V-NEXT:    lui a2, 16
-; RV64V-NEXT:    vsrl.vi v16, v8, 24
-; RV64V-NEXT:    vsrl.vi v10, v8, 8
-; RV64V-NEXT:    li t5, 255
-; RV64V-NEXT:    lui a0, 61681
-; RV64V-NEXT:    lui a1, 209715
-; RV64V-NEXT:    lui a5, 349525
-; RV64V-NEXT:    vsrl.vi v12, v9, 24
-; RV64V-NEXT:    vsrl.vi v11, v9, 8
-; RV64V-NEXT:    li ra, 16
-; RV64V-NEXT:    li s11, 32
-; RV64V-NEXT:    li s10, 64
-; RV64V-NEXT:    li s8, 128
-; RV64V-NEXT:    li s9, 256
-; RV64V-NEXT:    li s7, 512
-; RV64V-NEXT:    li s6, 1024
-; RV64V-NEXT:    li t0, 1
-; RV64V-NEXT:    lui s5, 1
-; RV64V-NEXT:    lui a6, 2
-; RV64V-NEXT:    lui a7, 4
-; RV64V-NEXT:    lui t1, 8
-; RV64V-NEXT:    lui t2, 32
-; RV64V-NEXT:    lui t3, 64
-; RV64V-NEXT:    lui t4, 128
-; RV64V-NEXT:    lui s3, 256
-; RV64V-NEXT:    lui s4, 512
-; RV64V-NEXT:    addi s0, a0, -241
-; RV64V-NEXT:    addi s1, a1, 819
-; RV64V-NEXT:    addi s2, a5, 1365
-; RV64V-NEXT:    slli a0, s0, 32
-; RV64V-NEXT:    add s0, s0, a0
-; RV64V-NEXT:    slli a0, s1, 32
-; RV64V-NEXT:    add s1, s1, a0
-; RV64V-NEXT:    slli a0, s2, 32
-; RV64V-NEXT:    add s2, s2, a0
-; RV64V-NEXT:    addi t6, a2, -256
-; RV64V-NEXT:    slli t5, t5, 24
-; RV64V-NEXT:    vsrl.vx v8, v9, a3
-; RV64V-NEXT:    li a0, 40
-; RV64V-NEXT:    vsrl.vx v13, v9, a0
-; RV64V-NEXT:    lui a1, 4080
-; RV64V-NEXT:    vand.vx v12, v12, a1
-; RV64V-NEXT:    vand.vx v14, v9, a1
-; RV64V-NEXT:    vsll.vx v15, v9, a3
-; RV64V-NEXT:    vand.vx v13, v13, t6
-; RV64V-NEXT:    vand.vx v11, v11, t5
-; RV64V-NEXT:    vsll.vi v14, v14, 24
-; RV64V-NEXT:    vand.vx v17, v9, t5
-; RV64V-NEXT:    vand.vx v9, v9, t6
-; RV64V-NEXT:    vor.vv v8, v13, v8
-; RV64V-NEXT:    vor.vv v11, v11, v12
-; RV64V-NEXT:    vsll.vi v12, v17, 8
-; RV64V-NEXT:    vsll.vx v9, v9, a0
-; RV64V-NEXT:    li a4, 40
-; RV64V-NEXT:    vor.vv v8, v11, v8
-; RV64V-NEXT:    vor.vv v11, v14, v12
-; RV64V-NEXT:    vor.vv v9, v15, v9
-; RV64V-NEXT:    vor.vv v9, v9, v11
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    vsrl.vi v9, v8, 4
-; RV64V-NEXT:    vand.vx v8, v8, s0
-; RV64V-NEXT:    vand.vx v9, v9, s0
-; RV64V-NEXT:    vsll.vi v8, v8, 4
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    vsrl.vi v9, v8, 2
-; RV64V-NEXT:    vand.vx v8, v8, s1
-; RV64V-NEXT:    vand.vx v9, v9, s1
-; RV64V-NEXT:    vsll.vi v8, v8, 2
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    vsrl.vi v9, v8, 1
-; RV64V-NEXT:    vand.vx v8, v8, s2
-; RV64V-NEXT:    vand.vx v9, v9, s2
-; RV64V-NEXT:    vadd.vv v8, v8, v8
-; RV64V-NEXT:    vor.vv v11, v9, v8
-; RV64V-NEXT:    vand.vx v13, v11, ra
-; RV64V-NEXT:    lui a0, 4096
-; RV64V-NEXT:    vand.vx v14, v11, s11
-; RV64V-NEXT:    lui a1, 8192
-; RV64V-NEXT:    vand.vx v15, v11, s10
-; RV64V-NEXT:    lui a3, 16384
-; RV64V-NEXT:    vand.vx v17, v11, s8
-; RV64V-NEXT:    lui s8, 32768
-; RV64V-NEXT:    vand.vx v19, v11, s9
-; RV64V-NEXT:    lui s9, 65536
-; RV64V-NEXT:    vand.vx v20, v11, s7
-; RV64V-NEXT:    lui s11, 131072
-; RV64V-NEXT:    vand.vx v21, v11, s6
-; RV64V-NEXT:    slli a5, t0, 11
-; RV64V-NEXT:    vand.vx v22, v11, a5
-; RV64V-NEXT:    lui ra, 262144
-; RV64V-NEXT:    li a5, 56
-; RV64V-NEXT:    vsrl.vx v5, v18, a5
-; RV64V-NEXT:    vsrl.vx v1, v18, a4
-; RV64V-NEXT:    lui s6, 4080
-; RV64V-NEXT:    vand.vx v2, v16, s6
-; RV64V-NEXT:    vand.vx v8, v18, s6
-; RV64V-NEXT:    vsll.vx v4, v18, a5
-; RV64V-NEXT:    vand.vx v23, v11, s5
-; RV64V-NEXT:    slli s10, t0, 31
-; RV64V-NEXT:    vand.vx v24, v11, a6
-; RV64V-NEXT:    slli a5, t0, 32
-; RV64V-NEXT:    sd a5, 96(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v25, v11, a7
-; RV64V-NEXT:    slli a5, t0, 33
-; RV64V-NEXT:    sd a5, 88(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v26, v11, t1
-; RV64V-NEXT:    slli a5, t0, 34
-; RV64V-NEXT:    sd a5, 80(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v27, v11, a2
-; RV64V-NEXT:    slli a2, t0, 35
-; RV64V-NEXT:    sd a2, 72(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v28, v11, t2
-; RV64V-NEXT:    slli a2, t0, 36
-; RV64V-NEXT:    sd a2, 64(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v29, v11, t3
-; RV64V-NEXT:    slli a2, t0, 37
-; RV64V-NEXT:    sd a2, 56(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v30, v11, t4
-; RV64V-NEXT:    slli a2, t0, 38
-; RV64V-NEXT:    sd a2, 48(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v31, v11, s3
-; RV64V-NEXT:    slli a2, t0, 39
-; RV64V-NEXT:    sd a2, 40(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v7, v11, s4
-; RV64V-NEXT:    slli a2, t0, 40
-; RV64V-NEXT:    sd a2, 32(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    lui a2, 1024
-; RV64V-NEXT:    vand.vx v9, v11, a2
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    mv a5, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a5, a5, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, a2, a5
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vs1r.v v9, (a2) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli a2, t0, 41
-; RV64V-NEXT:    sd a2, 24(sp) # 8-byte Folded Spill
-; RV64V-NEXT:    lui a2, 2048
-; RV64V-NEXT:    vand.vx v9, v11, a2
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    mv a5, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a5, a5, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a5, a5, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, a2, a5
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vs1r.v v9, (a2) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli s4, t0, 42
-; RV64V-NEXT:    vand.vx v9, v11, a0
-; RV64V-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 9
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a2, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a2, a2, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a2
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli s5, t0, 43
-; RV64V-NEXT:    vand.vx v9, v11, a1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli s6, t0, 44
-; RV64V-NEXT:    vand.vx v9, v11, a3
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli s7, t0, 45
-; RV64V-NEXT:    vand.vx v9, v1, t6
-; RV64V-NEXT:    vor.vv v9, v9, v5
-; RV64V-NEXT:    vand.vx v12, v11, s8
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    addi a0, sp, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli s8, t0, 46
-; RV64V-NEXT:    vand.vx v10, v10, t5
-; RV64V-NEXT:    vor.vv v10, v10, v2
-; RV64V-NEXT:    vand.vx v12, v11, s9
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli s9, t0, 47
-; RV64V-NEXT:    vsll.vi v8, v8, 24
-; RV64V-NEXT:    vor.vv v9, v10, v9
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v10, v18, t5
-; RV64V-NEXT:    vsll.vi v10, v10, 8
-; RV64V-NEXT:    vor.vv v8, v8, v10
-; RV64V-NEXT:    vand.vx v10, v18, t6
-; RV64V-NEXT:    vsll.vx v10, v10, a4
-; RV64V-NEXT:    vor.vv v10, v4, v10
-; RV64V-NEXT:    vand.vx v12, v11, s11
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli s11, t0, 48
-; RV64V-NEXT:    vor.vv v8, v10, v8
-; RV64V-NEXT:    vand.vx v10, v11, ra
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli ra, t0, 49
-; RV64V-NEXT:    vor.vv v8, v8, v9
-; RV64V-NEXT:    vsrl.vi v9, v8, 4
-; RV64V-NEXT:    vand.vx v8, v8, s0
-; RV64V-NEXT:    vand.vx v9, v9, s0
-; RV64V-NEXT:    vsll.vi v8, v8, 4
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    vsrl.vi v9, v8, 2
-; RV64V-NEXT:    vand.vx v8, v8, s1
-; RV64V-NEXT:    vand.vx v9, v9, s1
-; RV64V-NEXT:    vsll.vi v8, v8, 2
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    vsrl.vi v9, v8, 1
-; RV64V-NEXT:    vand.vx v8, v8, s2
-; RV64V-NEXT:    vand.vx v9, v9, s2
-; RV64V-NEXT:    vadd.vv v8, v8, v8
-; RV64V-NEXT:    vor.vv v4, v9, v8
-; RV64V-NEXT:    vand.vx v8, v11, s10
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    slli t4, t0, 50
-; RV64V-NEXT:    slli t3, t0, 51
-; RV64V-NEXT:    slli t2, t0, 52
-; RV64V-NEXT:    slli s10, t0, 53
-; RV64V-NEXT:    slli t1, t0, 54
-; RV64V-NEXT:    slli a7, t0, 55
-; RV64V-NEXT:    slli a6, t0, 56
-; RV64V-NEXT:    slli a5, t0, 57
-; RV64V-NEXT:    slli a4, t0, 58
-; RV64V-NEXT:    slli a2, t0, 59
-; RV64V-NEXT:    slli a1, t0, 60
-; RV64V-NEXT:    slli a3, t0, 61
-; RV64V-NEXT:    slli t0, t0, 62
-; RV64V-NEXT:    li a0, -1
-; RV64V-NEXT:    slli a0, a0, 63
-; RV64V-NEXT:    ld s3, 96(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v11, s3
-; RV64V-NEXT:    csrr s3, vlenb
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    mv s4, s3
-; RV64V-NEXT:    slli s3, s3, 4
-; RV64V-NEXT:    add s3, s3, s4
-; RV64V-NEXT:    add s3, sp, s3
-; RV64V-NEXT:    addi s3, s3, 112
-; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    ld s3, 88(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v11, s3
-; RV64V-NEXT:    csrr s3, vlenb
-; RV64V-NEXT:    slli s4, s3, 5
-; RV64V-NEXT:    add s3, s4, s3
-; RV64V-NEXT:    add s3, sp, s3
-; RV64V-NEXT:    addi s3, s3, 112
-; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    ld s3, 80(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v11, s3
-; RV64V-NEXT:    csrr s3, vlenb
-; RV64V-NEXT:    slli s3, s3, 5
-; RV64V-NEXT:    add s3, sp, s3
-; RV64V-NEXT:    addi s3, s3, 112
-; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    ld s3, 72(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v11, s3
-; RV64V-NEXT:    csrr s3, vlenb
-; RV64V-NEXT:    slli s4, s3, 5
-; RV64V-NEXT:    sub s3, s4, s3
-; RV64V-NEXT:    add s3, sp, s3
-; RV64V-NEXT:    addi s3, s3, 112
-; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    ld s3, 64(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v11, s3
-; RV64V-NEXT:    csrr s3, vlenb
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    mv s4, s3
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    add s4, s4, s3
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    add s4, s4, s3
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    add s3, s3, s4
-; RV64V-NEXT:    add s3, sp, s3
-; RV64V-NEXT:    addi s3, s3, 112
-; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    ld s3, 56(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v11, s3
-; RV64V-NEXT:    csrr s3, vlenb
-; RV64V-NEXT:    mv s4, s3
-; RV64V-NEXT:    slli s3, s3, 2
-; RV64V-NEXT:    add s4, s4, s3
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    add s4, s4, s3
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    add s3, s3, s4
-; RV64V-NEXT:    add s3, sp, s3
-; RV64V-NEXT:    addi s3, s3, 112
-; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    ld s3, 48(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v11, s3
-; RV64V-NEXT:    csrr s3, vlenb
-; RV64V-NEXT:    slli s3, s3, 2
-; RV64V-NEXT:    mv s4, s3
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    add s4, s4, s3
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    add s3, s3, s4
-; RV64V-NEXT:    add s3, sp, s3
-; RV64V-NEXT:    addi s3, s3, 112
-; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    ld s3, 40(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v11, s3
-; RV64V-NEXT:    csrr s3, vlenb
-; RV64V-NEXT:    mv s4, s3
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    add s4, s4, s3
-; RV64V-NEXT:    slli s3, s3, 2
-; RV64V-NEXT:    add s4, s4, s3
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    add s3, s3, s4
-; RV64V-NEXT:    add s3, sp, s3
-; RV64V-NEXT:    addi s3, s3, 112
-; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    ld s3, 32(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v11, s3
-; RV64V-NEXT:    csrr s3, vlenb
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    mv s4, s3
-; RV64V-NEXT:    slli s3, s3, 2
-; RV64V-NEXT:    add s4, s4, s3
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    add s3, s3, s4
-; RV64V-NEXT:    add s3, sp, s3
-; RV64V-NEXT:    addi s3, s3, 112
-; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    ld s3, 24(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    vand.vx v8, v11, s3
-; RV64V-NEXT:    csrr s3, vlenb
-; RV64V-NEXT:    mv s4, s3
-; RV64V-NEXT:    slli s3, s3, 3
-; RV64V-NEXT:    add s4, s4, s3
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    add s3, s3, s4
-; RV64V-NEXT:    ld s4, 8(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    add s3, sp, s3
-; RV64V-NEXT:    addi s3, s3, 112
-; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, s4
-; RV64V-NEXT:    csrr s3, vlenb
-; RV64V-NEXT:    slli s3, s3, 3
-; RV64V-NEXT:    mv s4, s3
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    add s3, s3, s4
-; RV64V-NEXT:    add s3, sp, s3
-; RV64V-NEXT:    addi s3, s3, 112
-; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, s5
-; RV64V-NEXT:    csrr s3, vlenb
-; RV64V-NEXT:    mv s4, s3
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    add s4, s4, s3
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    add s4, s4, s3
-; RV64V-NEXT:    slli s3, s3, 2
-; RV64V-NEXT:    add s3, s3, s4
-; RV64V-NEXT:    add s3, sp, s3
-; RV64V-NEXT:    addi s3, s3, 112
-; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, s6
-; RV64V-NEXT:    csrr s3, vlenb
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    mv s4, s3
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    add s4, s4, s3
-; RV64V-NEXT:    slli s3, s3, 2
-; RV64V-NEXT:    add s3, s3, s4
-; RV64V-NEXT:    add s3, sp, s3
-; RV64V-NEXT:    addi s3, s3, 112
-; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, s7
-; RV64V-NEXT:    csrr s3, vlenb
-; RV64V-NEXT:    mv s4, s3
-; RV64V-NEXT:    slli s3, s3, 2
-; RV64V-NEXT:    add s4, s4, s3
-; RV64V-NEXT:    slli s3, s3, 2
-; RV64V-NEXT:    add s3, s3, s4
-; RV64V-NEXT:    add s3, sp, s3
-; RV64V-NEXT:    addi s3, s3, 112
-; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, s8
-; RV64V-NEXT:    csrr s3, vlenb
-; RV64V-NEXT:    slli s3, s3, 2
-; RV64V-NEXT:    mv s4, s3
-; RV64V-NEXT:    slli s3, s3, 2
-; RV64V-NEXT:    add s3, s3, s4
-; RV64V-NEXT:    add s3, sp, s3
-; RV64V-NEXT:    addi s3, s3, 112
-; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, s9
-; RV64V-NEXT:    csrr s3, vlenb
-; RV64V-NEXT:    mv s4, s3
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    add s4, s4, s3
-; RV64V-NEXT:    slli s3, s3, 3
-; RV64V-NEXT:    add s3, s3, s4
-; RV64V-NEXT:    add s3, sp, s3
-; RV64V-NEXT:    addi s3, s3, 112
-; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, s11
-; RV64V-NEXT:    csrr s3, vlenb
-; RV64V-NEXT:    slli s3, s3, 1
-; RV64V-NEXT:    mv s4, s3
-; RV64V-NEXT:    slli s3, s3, 3
-; RV64V-NEXT:    add s3, s3, s4
-; RV64V-NEXT:    add s3, sp, s3
-; RV64V-NEXT:    addi s3, s3, 112
-; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, ra
-; RV64V-NEXT:    csrr s3, vlenb
-; RV64V-NEXT:    slli s4, s3, 4
-; RV64V-NEXT:    add s3, s4, s3
-; RV64V-NEXT:    add s3, sp, s3
-; RV64V-NEXT:    addi s3, s3, 112
-; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, t4
-; RV64V-NEXT:    csrr t4, vlenb
-; RV64V-NEXT:    slli t4, t4, 4
-; RV64V-NEXT:    add t4, sp, t4
-; RV64V-NEXT:    addi t4, t4, 112
-; RV64V-NEXT:    vs1r.v v8, (t4) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, t3
-; RV64V-NEXT:    csrr t3, vlenb
-; RV64V-NEXT:    slli t4, t3, 4
-; RV64V-NEXT:    sub t3, t4, t3
-; RV64V-NEXT:    add t3, sp, t3
-; RV64V-NEXT:    addi t3, t3, 112
-; RV64V-NEXT:    vs1r.v v8, (t3) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, t2
-; RV64V-NEXT:    csrr t2, vlenb
-; RV64V-NEXT:    slli t2, t2, 1
-; RV64V-NEXT:    mv t3, t2
-; RV64V-NEXT:    slli t2, t2, 1
-; RV64V-NEXT:    add t3, t3, t2
-; RV64V-NEXT:    slli t2, t2, 1
-; RV64V-NEXT:    add t2, t2, t3
-; RV64V-NEXT:    add t2, sp, t2
-; RV64V-NEXT:    addi t2, t2, 112
-; RV64V-NEXT:    vs1r.v v8, (t2) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, s10
-; RV64V-NEXT:    csrr t2, vlenb
-; RV64V-NEXT:    mv t3, t2
-; RV64V-NEXT:    slli t2, t2, 2
-; RV64V-NEXT:    add t3, t3, t2
-; RV64V-NEXT:    slli t2, t2, 1
-; RV64V-NEXT:    add t2, t2, t3
-; RV64V-NEXT:    add t2, sp, t2
-; RV64V-NEXT:    addi t2, t2, 112
-; RV64V-NEXT:    vs1r.v v8, (t2) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, t1
-; RV64V-NEXT:    csrr t1, vlenb
-; RV64V-NEXT:    slli t1, t1, 2
-; RV64V-NEXT:    mv t2, t1
-; RV64V-NEXT:    slli t1, t1, 1
-; RV64V-NEXT:    add t1, t1, t2
-; RV64V-NEXT:    add t1, sp, t1
-; RV64V-NEXT:    addi t1, t1, 112
-; RV64V-NEXT:    vs1r.v v8, (t1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, a7
-; RV64V-NEXT:    csrr a7, vlenb
-; RV64V-NEXT:    mv t1, a7
-; RV64V-NEXT:    slli a7, a7, 1
-; RV64V-NEXT:    add t1, t1, a7
-; RV64V-NEXT:    slli a7, a7, 2
-; RV64V-NEXT:    add a7, a7, t1
-; RV64V-NEXT:    add a7, sp, a7
-; RV64V-NEXT:    addi a7, a7, 112
-; RV64V-NEXT:    vs1r.v v8, (a7) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, a6
-; RV64V-NEXT:    csrr a6, vlenb
-; RV64V-NEXT:    slli a6, a6, 1
-; RV64V-NEXT:    mv a7, a6
-; RV64V-NEXT:    slli a6, a6, 2
-; RV64V-NEXT:    add a6, a6, a7
-; RV64V-NEXT:    add a6, sp, a6
-; RV64V-NEXT:    addi a6, a6, 112
-; RV64V-NEXT:    vs1r.v v8, (a6) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, a5
-; RV64V-NEXT:    csrr a5, vlenb
-; RV64V-NEXT:    slli a6, a5, 3
-; RV64V-NEXT:    add a5, a6, a5
-; RV64V-NEXT:    add a5, sp, a5
-; RV64V-NEXT:    addi a5, a5, 112
-; RV64V-NEXT:    vs1r.v v8, (a5) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, a4
-; RV64V-NEXT:    csrr a4, vlenb
-; RV64V-NEXT:    slli a4, a4, 3
-; RV64V-NEXT:    add a4, sp, a4
-; RV64V-NEXT:    addi a4, a4, 112
-; RV64V-NEXT:    vs1r.v v8, (a4) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, a2
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a4, a2, 3
-; RV64V-NEXT:    sub a2, a4, a2
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vs1r.v v8, (a2) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vx v8, v11, a1
-; RV64V-NEXT:    csrr a1, vlenb
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    mv a2, a1
-; RV64V-NEXT:    slli a1, a1, 1
-; RV64V-NEXT:    add a1, a1, a2
-; RV64V-NEXT:    add a1, sp, a1
-; RV64V-NEXT:    addi a1, a1, 112
-; RV64V-NEXT:    vs1r.v v8, (a1) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vand.vi v8, v11, 2
-; RV64V-NEXT:    vand.vi v9, v11, 1
-; RV64V-NEXT:    vand.vi v10, v11, 4
-; RV64V-NEXT:    vand.vi v12, v11, 8
-; RV64V-NEXT:    vand.vx v6, v11, a3
-; RV64V-NEXT:    vand.vx v5, v11, t0
-; RV64V-NEXT:    vand.vx v2, v11, a0
-; RV64V-NEXT:    vmul.vv v3, v4, v8
-; RV64V-NEXT:    vmul.vv v8, v4, v9
-; RV64V-NEXT:    vmul.vv v9, v4, v10
-; RV64V-NEXT:    vmul.vv v10, v4, v12
-; RV64V-NEXT:    vmul.vv v11, v4, v13
-; RV64V-NEXT:    vmul.vv v12, v4, v14
-; RV64V-NEXT:    vmul.vv v13, v4, v15
-; RV64V-NEXT:    vmul.vv v14, v4, v17
-; RV64V-NEXT:    vmul.vv v15, v4, v19
-; RV64V-NEXT:    vmul.vv v16, v4, v20
-; RV64V-NEXT:    vmul.vv v17, v4, v21
-; RV64V-NEXT:    vmul.vv v18, v4, v22
-; RV64V-NEXT:    vmul.vv v19, v4, v23
-; RV64V-NEXT:    vmul.vv v20, v4, v24
-; RV64V-NEXT:    vmul.vv v21, v4, v25
-; RV64V-NEXT:    vmul.vv v22, v4, v26
-; RV64V-NEXT:    vmul.vv v23, v4, v27
-; RV64V-NEXT:    vmul.vv v24, v4, v28
-; RV64V-NEXT:    vmul.vv v25, v4, v29
-; RV64V-NEXT:    vmul.vv v26, v4, v30
-; RV64V-NEXT:    vmul.vv v27, v4, v31
-; RV64V-NEXT:    vmul.vv v28, v4, v7
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 6
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 6
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v29, v4, v29
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v30, v4, v30
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 9
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v31, v4, v31
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v7, v4, v7
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 2
-; RV64V-NEXT:    add a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 1
-; RV64V-NEXT:    add a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v0
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v0
-; RV64V-NEXT:    addi a0, sp, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v0, v4, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 5
-; RV64V-NEXT:    add a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 5
-; RV64V-NEXT:    add a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 5
-; RV64V-NEXT:    sub a0, a1, a0
+; RV64V-NEXT:    slli a0, a0, 7
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 5
-; RV64V-NEXT:    sub a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 6
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
@@ -53024,230 +36404,347 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscal
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 6
 ; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 7
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 8
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 6
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 4
-; RV64V-NEXT:    add a0, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 4
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 4
-; RV64V-NEXT:    add a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 4
-; RV64V-NEXT:    sub a0, a1, a0
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
-; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
@@ -53256,421 +36753,761 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscal
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 6
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 7
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v8, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v16, v8
+; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    vxor.vv v8, v8, v0
+; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 3
-; RV64V-NEXT:    add a0, a1, a0
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v8, v16
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v16, v8
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a1, a0, 3
-; RV64V-NEXT:    sub a0, a1, a0
+; RV64V-NEXT:    slli a0, a0, 7
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v24
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    csrr a0, vlenb
 ; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v24
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV64V-NEXT:    vmul.vv v6, v4, v6
-; RV64V-NEXT:    vmul.vv v5, v4, v5
-; RV64V-NEXT:    vmul.vv v4, v4, v2
-; RV64V-NEXT:    vxor.vv v8, v8, v3
-; RV64V-NEXT:    vxor.vv v8, v8, v9
-; RV64V-NEXT:    vxor.vv v8, v8, v10
-; RV64V-NEXT:    vxor.vv v8, v8, v11
-; RV64V-NEXT:    vxor.vv v8, v8, v12
-; RV64V-NEXT:    vxor.vv v8, v8, v13
-; RV64V-NEXT:    vxor.vv v8, v8, v14
-; RV64V-NEXT:    vxor.vv v9, v8, v15
-; RV64V-NEXT:    vxor.vv v9, v9, v16
-; RV64V-NEXT:    vxor.vv v9, v9, v17
-; RV64V-NEXT:    vxor.vv v9, v9, v18
-; RV64V-NEXT:    vxor.vv v9, v9, v19
-; RV64V-NEXT:    vxor.vv v9, v9, v20
-; RV64V-NEXT:    vxor.vv v9, v9, v21
-; RV64V-NEXT:    vxor.vv v9, v9, v22
-; RV64V-NEXT:    vxor.vv v9, v9, v23
-; RV64V-NEXT:    vxor.vv v9, v9, v24
-; RV64V-NEXT:    vxor.vv v9, v9, v25
-; RV64V-NEXT:    vxor.vv v9, v9, v26
-; RV64V-NEXT:    vxor.vv v9, v9, v27
-; RV64V-NEXT:    vxor.vv v9, v9, v28
-; RV64V-NEXT:    vxor.vv v9, v9, v29
-; RV64V-NEXT:    vxor.vv v9, v9, v30
-; RV64V-NEXT:    vxor.vv v10, v9, v31
-; RV64V-NEXT:    vxor.vv v10, v10, v7
-; RV64V-NEXT:    li a1, 56
-; RV64V-NEXT:    vsll.vx v8, v8, a1
-; RV64V-NEXT:    vand.vx v9, v9, t6
-; RV64V-NEXT:    li a0, 40
-; RV64V-NEXT:    vsll.vx v9, v9, a0
-; RV64V-NEXT:    vor.vv v8, v8, v9
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a3, a2, 2
-; RV64V-NEXT:    add a2, a3, a2
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v9, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v9, v10, v9
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v9, v9, v10
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a3, a2, 1
-; RV64V-NEXT:    add a2, a3, a2
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v9, v9, v10
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v9, v9, v10
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v9, v9, v10
-; RV64V-NEXT:    addi a2, sp, 112
-; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v9, v9, v10
-; RV64V-NEXT:    vxor.vv v9, v9, v0
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 4
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v9, v9, v10
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a3, a2, 5
-; RV64V-NEXT:    add a2, a3, a2
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v9, v9, v10
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 5
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v9, v9, v10
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a3, a2, 5
-; RV64V-NEXT:    sub a2, a3, a2
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v9, v9, v10
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v9, v9, v10
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v9, v9, v10
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v9, v9, v10
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v9, v10
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 3
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 3
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v10, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 3
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v11, v10, v11
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 3
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v11, v11, v12
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a3, a2, 4
-; RV64V-NEXT:    add a2, a3, a2
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v11, v11, v12
-; RV64V-NEXT:    vsrl.vi v12, v9, 8
-; RV64V-NEXT:    vand.vx v12, v12, t5
-; RV64V-NEXT:    vsrl.vi v10, v10, 24
-; RV64V-NEXT:    lui a2, 4080
-; RV64V-NEXT:    vand.vx v10, v10, a2
-; RV64V-NEXT:    vor.vv v10, v12, v10
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 6
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v24
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v8, v24
+; RV64V-NEXT:    addi a0, sp, 304
+; RV64V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    li a0, 56
+; RV64V-NEXT:    vsll.vx v16, v16, a0
+; RV64V-NEXT:    ld a1, 280(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v8, a1
+; RV64V-NEXT:    li a2, 40
+; RV64V-NEXT:    vsll.vx v8, v8, a2
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v24, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 6
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 9
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 6
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v24, v8
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 5
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 7
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v0, v8, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v0, v0, v16
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v0, v16
+; RV64V-NEXT:    vsrl.vi v0, v24, 8
+; RV64V-NEXT:    ld a4, 288(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v0, v0, a4
+; RV64V-NEXT:    vsrl.vi v8, v8, 24
+; RV64V-NEXT:    lui a3, 4080
+; RV64V-NEXT:    vand.vx v8, v8, a3
+; RV64V-NEXT:    vor.vv v8, v0, v8
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 4
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 1
+; RV64V-NEXT:    add a6, a6, a5
+; RV64V-NEXT:    slli a5, a5, 4
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 304
+; RV64V-NEXT:    vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 2
+; RV64V-NEXT:    add a6, a6, a5
+; RV64V-NEXT:    slli a5, a5, 1
+; RV64V-NEXT:    add a6, a6, a5
+; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 304
+; RV64V-NEXT:    vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v16, v8
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 5
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 1
+; RV64V-NEXT:    add a6, a6, a5
+; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 304
+; RV64V-NEXT:    vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 1
+; RV64V-NEXT:    add a6, a6, a5
+; RV64V-NEXT:    slli a5, a5, 2
+; RV64V-NEXT:    add a6, a6, a5
+; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 304
+; RV64V-NEXT:    vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 4
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 2
+; RV64V-NEXT:    add a6, a6, a5
+; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 304
+; RV64V-NEXT:    vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    add a6, a6, a5
+; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 304
+; RV64V-NEXT:    vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v8, v8, v16
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 6
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 304
+; RV64V-NEXT:    vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v8, v16
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a5, a5, 3
+; RV64V-NEXT:    mv a6, a5
+; RV64V-NEXT:    slli a5, a5, 1
+; RV64V-NEXT:    add a6, a6, a5
+; RV64V-NEXT:    slli a5, a5, 1
+; RV64V-NEXT:    add a6, a6, a5
+; RV64V-NEXT:    slli a5, a5, 4
+; RV64V-NEXT:    add a5, a5, a6
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 304
+; RV64V-NEXT:    vl8r.v v0, (a5) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v16, v16, v0
+; RV64V-NEXT:    vand.vx v24, v24, a3
+; RV64V-NEXT:    vsll.vi v24, v24, 24
+; RV64V-NEXT:    vand.vx v0, v8, a4
+; RV64V-NEXT:    vsll.vi v0, v0, 8
+; RV64V-NEXT:    vor.vv v24, v24, v0
 ; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
 ; RV64V-NEXT:    mv a4, a3
 ; RV64V-NEXT:    slli a3, a3, 1
 ; RV64V-NEXT:    add a4, a4, a3
@@ -53679,223 +37516,163 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscal
 ; RV64V-NEXT:    slli a3, a3, 3
 ; RV64V-NEXT:    add a3, a3, a4
 ; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 112
-; RV64V-NEXT:    vl1r.v v12, (a3) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v0, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v0, v16, v0
 ; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
 ; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vor.vv v16, v16, v24
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
 ; RV64V-NEXT:    mv a4, a3
 ; RV64V-NEXT:    slli a3, a3, 1
 ; RV64V-NEXT:    add a4, a4, a3
 ; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
 ; RV64V-NEXT:    add a3, a3, a4
 ; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 112
-; RV64V-NEXT:    vl1r.v v12, (a3) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v24, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v0, v24
 ; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 4
 ; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 2
-; RV64V-NEXT:    add a4, a4, a3
 ; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
 ; RV64V-NEXT:    add a3, a3, a4
 ; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 112
-; RV64V-NEXT:    vl1r.v v12, (a3) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v0, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v0
 ; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    add a4, a4, a3
 ; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v0, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 7
 ; RV64V-NEXT:    mv a4, a3
-; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    slli a3, a3, 2
 ; RV64V-NEXT:    add a3, a3, a4
 ; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 112
-; RV64V-NEXT:    vl1r.v v12, (a3) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v0, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v0
 ; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 3
 ; RV64V-NEXT:    mv a4, a3
 ; RV64V-NEXT:    slli a3, a3, 1
 ; RV64V-NEXT:    add a4, a4, a3
-; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
 ; RV64V-NEXT:    add a3, a3, a4
 ; RV64V-NEXT:    add a3, sp, a3
-; RV64V-NEXT:    addi a3, a3, 112
-; RV64V-NEXT:    vl1r.v v12, (a3) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v11, v11, v12
-; RV64V-NEXT:    vand.vx v9, v9, a2
-; RV64V-NEXT:    vsll.vi v9, v9, 24
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 3
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v11, v12
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 3
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v13
-; RV64V-NEXT:    vand.vx v13, v11, t5
-; RV64V-NEXT:    vsll.vi v13, v13, 8
-; RV64V-NEXT:    vor.vv v9, v9, v13
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v12, v12, v13
-; RV64V-NEXT:    vor.vv v8, v8, v9
-; RV64V-NEXT:    csrr a2, vlenb
-; RV64V-NEXT:    mv a3, a2
-; RV64V-NEXT:    slli a2, a2, 1
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a3, a3, a2
-; RV64V-NEXT:    slli a2, a2, 2
-; RV64V-NEXT:    add a2, a2, a3
-; RV64V-NEXT:    add a2, sp, a2
-; RV64V-NEXT:    addi a2, a2, 112
-; RV64V-NEXT:    vl1r.v v9, (a2) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v9, v12, v9
-; RV64V-NEXT:    vsrl.vx v11, v11, a0
-; RV64V-NEXT:    vand.vx v11, v11, t6
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    mv a2, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a2, a2, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a2
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vxor.vv v9, v9, v12
-; RV64V-NEXT:    vxor.vv v9, v9, v6
-; RV64V-NEXT:    vxor.vv v9, v9, v5
-; RV64V-NEXT:    vxor.vv v9, v9, v4
-; RV64V-NEXT:    vsrl.vx v9, v9, a1
-; RV64V-NEXT:    vor.vv v9, v11, v9
-; RV64V-NEXT:    vor.vv v9, v10, v9
-; RV64V-NEXT:    vor.vv v8, v8, v9
-; RV64V-NEXT:    vsrl.vi v9, v8, 4
-; RV64V-NEXT:    vand.vx v8, v8, s0
-; RV64V-NEXT:    vand.vx v9, v9, s0
-; RV64V-NEXT:    vsll.vi v8, v8, 4
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    vsrl.vi v9, v8, 2
-; RV64V-NEXT:    vand.vx v8, v8, s1
-; RV64V-NEXT:    vand.vx v9, v9, s1
-; RV64V-NEXT:    vsll.vi v8, v8, 2
-; RV64V-NEXT:    vor.vv v8, v9, v8
-; RV64V-NEXT:    vsrl.vi v9, v8, 1
-; RV64V-NEXT:    vand.vx v8, v8, s2
-; RV64V-NEXT:    vand.vx v9, v9, s2
-; RV64V-NEXT:    vadd.vv v8, v8, v8
-; RV64V-NEXT:    vor.vv v9, v9, v8
+; RV64V-NEXT:    addi a3, a3, 304
+; RV64V-NEXT:    vl8r.v v0, (a3) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vsrl.vx v8, v8, a2
+; RV64V-NEXT:    vand.vx v8, v8, a1
+; RV64V-NEXT:    vsrl.vx v24, v24, a0
+; RV64V-NEXT:    vor.vv v8, v8, v24
 ; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 1
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    slli a0, a0, 4
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    addi a0, a0, 304
+; RV64V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vor.vv v8, v24, v8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 4
+; RV64V-NEXT:    ld a0, 256(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 2
+; RV64V-NEXT:    ld a0, 264(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 1
+; RV64V-NEXT:    ld a0, 272(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v8, v8, 1
 ; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
 ; RV64V-NEXT:    mv a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 2
-; RV64V-NEXT:    add a0, a0, a1
-; RV64V-NEXT:    add a0, sp, a0
-; RV64V-NEXT:    addi a0, a0, 112
-; RV64V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV64V-NEXT:    vsrl.vi v8, v9, 1, v0.t
-; RV64V-NEXT:    csrr a0, vlenb
-; RV64V-NEXT:    mv a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
-; RV64V-NEXT:    slli a0, a0, 1
-; RV64V-NEXT:    add a1, a1, a0
 ; RV64V-NEXT:    slli a0, a0, 2
 ; RV64V-NEXT:    add a0, a0, a1
 ; RV64V-NEXT:    add sp, sp, a0
-; RV64V-NEXT:    .cfi_def_cfa sp, 224
-; RV64V-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64V-NEXT:    .cfi_restore ra
-; RV64V-NEXT:    .cfi_restore s0
-; RV64V-NEXT:    .cfi_restore s1
-; RV64V-NEXT:    .cfi_restore s2
-; RV64V-NEXT:    .cfi_restore s3
-; RV64V-NEXT:    .cfi_restore s4
-; RV64V-NEXT:    .cfi_restore s5
-; RV64V-NEXT:    .cfi_restore s6
-; RV64V-NEXT:    .cfi_restore s7
-; RV64V-NEXT:    .cfi_restore s8
-; RV64V-NEXT:    .cfi_restore s9
-; RV64V-NEXT:    .cfi_restore s10
-; RV64V-NEXT:    .cfi_restore s11
-; RV64V-NEXT:    addi sp, sp, 224
-; RV64V-NEXT:    .cfi_def_cfa_offset 0
+; RV64V-NEXT:    ld ra, 408(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s0, 400(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s1, 392(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s2, 384(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s3, 376(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s4, 368(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s5, 360(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s6, 352(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s7, 344(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s8, 336(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s9, 328(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s10, 320(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s11, 312(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    addi sp, sp, 416
 ; RV64V-NEXT:    ret
 ;
-; RV32ZVBC-LABEL: clmulh_nxv1i64_vv_mask:
+; RV32ZVBC-LABEL: clmulh_nxv8i64_vv:
 ; RV32ZVBC:       # %bb.0:
-; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
-; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v9, v0.t
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v16
 ; RV32ZVBC-NEXT:    ret
 ;
-; RV64ZVBC-LABEL: clmulh_nxv1i64_vv_mask:
+; RV64ZVBC-LABEL: clmulh_nxv8i64_vv:
 ; RV64ZVBC:       # %bb.0:
-; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
-; RV64ZVBC-NEXT:    vclmulh.vv v8, v8, v9, v0.t
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC-NEXT:    vclmulh.vv v8, v8, v16
 ; RV64ZVBC-NEXT:    ret
-  %va.ext = zext <vscale x 1 x i64> %va to <vscale x 1 x i128>
-  %vb.ext = zext <vscale x 1 x i64> %vb to <vscale x 1 x i128>
-  %clmul = call <vscale x 1 x i128> @llvm.clmul.nxv1i128(<vscale x 1 x i128> %va.ext, <vscale x 1 x i128> %vb.ext)
-  %res.ext = lshr <vscale x 1 x i128> %clmul, splat(i128 64)
-  %res = trunc <vscale x 1 x i128> %res.ext to <vscale x 1 x i64>
-  %sel = select <vscale x 1 x i1> %mask, <vscale x 1 x i64> %res, <vscale x 1 x i64> %va
-  ret <vscale x 1 x i64> %sel
+  %va.ext = zext <vscale x 8 x i64> %va to <vscale x 8 x i128>
+  %vb.ext = zext <vscale x 8 x i64> %vb to <vscale x 8 x i128>
+  %clmul = call <vscale x 8 x i128> @llvm.clmul.nxv8i128(<vscale x 8 x i128> %va.ext, <vscale x 8 x i128> %vb.ext)
+  %res.ext = lshr <vscale x 8 x i128> %clmul, splat(i128 64)
+  %res = trunc <vscale x 8 x i128> %res.ext to <vscale x 8 x i64>
+  ret <vscale x 8 x i64> %res
 }
 
-define <vscale x 1 x i64> @clmulh_nxv1i64_vx_mask(<vscale x 1 x i64> %va, i64 %b, <vscale x 1 x i1> %mask) {
-; RV32V-LABEL: clmulh_nxv1i64_vx_mask:
+define <vscale x 8 x i64> @clmulh_nxv8i64_vx(<vscale x 8 x i64> %va, i64 %b) nounwind {
+; RV32V-LABEL: clmulh_nxv8i64_vx:
 ; RV32V:       # %bb.0:
 ; RV32V-NEXT:    addi sp, sp, -368
-; RV32V-NEXT:    .cfi_def_cfa_offset 368
 ; RV32V-NEXT:    sw ra, 364(sp) # 4-byte Folded Spill
 ; RV32V-NEXT:    sw s0, 360(sp) # 4-byte Folded Spill
 ; RV32V-NEXT:    sw s1, 356(sp) # 4-byte Folded Spill
@@ -53909,372 +37686,2813 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vx_mask(<vscale x 1 x i64> %va, i64 %b
 ; RV32V-NEXT:    sw s9, 324(sp) # 4-byte Folded Spill
 ; RV32V-NEXT:    sw s10, 320(sp) # 4-byte Folded Spill
 ; RV32V-NEXT:    sw s11, 316(sp) # 4-byte Folded Spill
-; RV32V-NEXT:    .cfi_offset ra, -4
-; RV32V-NEXT:    .cfi_offset s0, -8
-; RV32V-NEXT:    .cfi_offset s1, -12
-; RV32V-NEXT:    .cfi_offset s2, -16
-; RV32V-NEXT:    .cfi_offset s3, -20
-; RV32V-NEXT:    .cfi_offset s4, -24
-; RV32V-NEXT:    .cfi_offset s5, -28
-; RV32V-NEXT:    .cfi_offset s6, -32
-; RV32V-NEXT:    .cfi_offset s7, -36
-; RV32V-NEXT:    .cfi_offset s8, -40
-; RV32V-NEXT:    .cfi_offset s9, -44
-; RV32V-NEXT:    .cfi_offset s10, -48
-; RV32V-NEXT:    .cfi_offset s11, -52
 ; RV32V-NEXT:    csrr a2, vlenb
+; RV32V-NEXT:    slli a2, a2, 5
 ; RV32V-NEXT:    mv a3, a2
-; RV32V-NEXT:    slli a2, a2, 3
+; RV32V-NEXT:    slli a2, a2, 2
 ; RV32V-NEXT:    add a3, a3, a2
 ; RV32V-NEXT:    slli a2, a2, 2
 ; RV32V-NEXT:    add a2, a2, a3
 ; RV32V-NEXT:    sub sp, sp, a2
-; RV32V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xf0, 0x02, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 368 + 41 * vlenb
 ; RV32V-NEXT:    csrr a2, vlenb
 ; RV32V-NEXT:    slli a2, a2, 3
 ; RV32V-NEXT:    mv a3, a2
+; RV32V-NEXT:    slli a2, a2, 4
+; RV32V-NEXT:    add a3, a3, a2
 ; RV32V-NEXT:    slli a2, a2, 2
 ; RV32V-NEXT:    add a2, a2, a3
 ; RV32V-NEXT:    add a2, sp, a2
 ; RV32V-NEXT:    addi a2, a2, 304
-; RV32V-NEXT:    vs1r.v v0, (a2) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    sw a0, 16(sp)
 ; RV32V-NEXT:    sw a1, 20(sp)
-; RV32V-NEXT:    addi s6, sp, 16
-; RV32V-NEXT:    lui s7, 1044480
-; RV32V-NEXT:    li s11, 1
-; RV32V-NEXT:    li s8, 2
-; RV32V-NEXT:    li s2, 4
-; RV32V-NEXT:    li s10, 8
-; RV32V-NEXT:    li s5, 32
-; RV32V-NEXT:    li s4, 64
-; RV32V-NEXT:    li s3, 128
-; RV32V-NEXT:    li ra, 256
-; RV32V-NEXT:    li s1, 512
-; RV32V-NEXT:    li s0, 1024
-; RV32V-NEXT:    lui t6, 1
-; RV32V-NEXT:    lui t5, 2
-; RV32V-NEXT:    lui t4, 4
-; RV32V-NEXT:    lui t3, 8
-; RV32V-NEXT:    lui t2, 16
-; RV32V-NEXT:    lui t1, 32
-; RV32V-NEXT:    lui t0, 64
-; RV32V-NEXT:    lui a7, 128
-; RV32V-NEXT:    lui a6, 256
-; RV32V-NEXT:    lui a5, 512
-; RV32V-NEXT:    lui a4, 1024
-; RV32V-NEXT:    lui a3, 2048
-; RV32V-NEXT:    lui a2, 4096
-; RV32V-NEXT:    lui a1, 8192
-; RV32V-NEXT:    vsetvli s9, zero, e64, m1, ta, ma
-; RV32V-NEXT:    vlse64.v v13, (s6), zero
-; RV32V-NEXT:    lui s6, 16384
-; RV32V-NEXT:    sw s7, 288(sp)
-; RV32V-NEXT:    lui s7, 32768
-; RV32V-NEXT:    sw zero, 292(sp)
-; RV32V-NEXT:    lui a0, 524288
-; RV32V-NEXT:    sw a0, 280(sp)
-; RV32V-NEXT:    sw zero, 284(sp)
+; RV32V-NEXT:    addi a2, sp, 16
+; RV32V-NEXT:    lui t6, 16
+; RV32V-NEXT:    li t5, 56
+; RV32V-NEXT:    li t4, 40
+; RV32V-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV32V-NEXT:    vlse64.v v24, (a2), zero
+; RV32V-NEXT:    vsrl.vx v16, v8, t5
+; RV32V-NEXT:    vsrl.vx v0, v8, t4
+; RV32V-NEXT:    addi t3, t6, -256
+; RV32V-NEXT:    vand.vx v0, v0, t3
+; RV32V-NEXT:    vor.vv v16, v0, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v16, v8, t3
+; RV32V-NEXT:    vsll.vx v16, v16, t4
+; RV32V-NEXT:    vsll.vx v0, v8, t5
+; RV32V-NEXT:    vor.vv v8, v0, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vsrl.vx v8, v24, t4
+; RV32V-NEXT:    vand.vx v8, v8, t3
+; RV32V-NEXT:    vsrl.vx v0, v24, t5
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vor.vv v8, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v8, v24, t3
+; RV32V-NEXT:    vsll.vx v8, v8, t4
+; RV32V-NEXT:    vsll.vx v0, v24, t5
+; RV32V-NEXT:    vor.vv v8, v0, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a4, 1044480
+; RV32V-NEXT:    lui a5, 524288
+; RV32V-NEXT:    li ra, 1
+; RV32V-NEXT:    li a6, 2
+; RV32V-NEXT:    li a7, 4
+; RV32V-NEXT:    li s0, 8
+; RV32V-NEXT:    li s11, 16
+; RV32V-NEXT:    li s10, 32
+; RV32V-NEXT:    li s9, 64
+; RV32V-NEXT:    li s8, 128
+; RV32V-NEXT:    li s7, 256
+; RV32V-NEXT:    li s6, 512
+; RV32V-NEXT:    li s5, 1024
+; RV32V-NEXT:    lui s4, 1
+; RV32V-NEXT:    lui s3, 2
+; RV32V-NEXT:    lui s2, 4
+; RV32V-NEXT:    lui s1, 8
+; RV32V-NEXT:    lui a0, 32
+; RV32V-NEXT:    lui a1, 64
+; RV32V-NEXT:    lui a2, 128
+; RV32V-NEXT:    lui a3, 256
+; RV32V-NEXT:    lui t1, 512
+; RV32V-NEXT:    lui t0, 1024
+; RV32V-NEXT:    lui t2, 2048
+; RV32V-NEXT:    sw a4, 264(sp)
+; RV32V-NEXT:    lui a4, 4096
+; RV32V-NEXT:    sw zero, 268(sp)
+; RV32V-NEXT:    sw a5, 24(sp)
+; RV32V-NEXT:    sw zero, 28(sp)
+; RV32V-NEXT:    sw zero, 288(sp)
+; RV32V-NEXT:    sw ra, 292(sp)
 ; RV32V-NEXT:    sw zero, 272(sp)
-; RV32V-NEXT:    sw s11, 276(sp)
-; RV32V-NEXT:    sw zero, 264(sp)
-; RV32V-NEXT:    sw s8, 268(sp)
-; RV32V-NEXT:    lui s8, 65536
+; RV32V-NEXT:    sw a6, 276(sp)
+; RV32V-NEXT:    lui a6, 8192
+; RV32V-NEXT:    sw zero, 280(sp)
+; RV32V-NEXT:    sw a7, 284(sp)
+; RV32V-NEXT:    lui a7, 16384
 ; RV32V-NEXT:    sw zero, 256(sp)
-; RV32V-NEXT:    sw s2, 260(sp)
-; RV32V-NEXT:    lui s9, 131072
+; RV32V-NEXT:    sw s0, 260(sp)
+; RV32V-NEXT:    lui s0, 32768
 ; RV32V-NEXT:    sw zero, 248(sp)
-; RV32V-NEXT:    sw s10, 252(sp)
-; RV32V-NEXT:    lui s10, 262144
+; RV32V-NEXT:    sw s11, 252(sp)
 ; RV32V-NEXT:    sw zero, 240(sp)
-; RV32V-NEXT:    li s2, 16
-; RV32V-NEXT:    sw s2, 244(sp)
+; RV32V-NEXT:    sw s10, 244(sp)
 ; RV32V-NEXT:    sw zero, 232(sp)
-; RV32V-NEXT:    sw s5, 236(sp)
+; RV32V-NEXT:    sw s9, 236(sp)
 ; RV32V-NEXT:    sw zero, 224(sp)
-; RV32V-NEXT:    sw s4, 228(sp)
+; RV32V-NEXT:    sw s8, 228(sp)
 ; RV32V-NEXT:    sw zero, 216(sp)
-; RV32V-NEXT:    sw s3, 220(sp)
+; RV32V-NEXT:    sw s7, 220(sp)
 ; RV32V-NEXT:    sw zero, 208(sp)
-; RV32V-NEXT:    sw ra, 212(sp)
+; RV32V-NEXT:    sw s6, 212(sp)
 ; RV32V-NEXT:    sw zero, 200(sp)
-; RV32V-NEXT:    sw s1, 204(sp)
+; RV32V-NEXT:    sw s5, 204(sp)
+; RV32V-NEXT:    slli ra, ra, 11
 ; RV32V-NEXT:    sw zero, 192(sp)
-; RV32V-NEXT:    sw s0, 196(sp)
-; RV32V-NEXT:    slli s11, s11, 11
+; RV32V-NEXT:    sw ra, 196(sp)
 ; RV32V-NEXT:    sw zero, 184(sp)
-; RV32V-NEXT:    sw s11, 188(sp)
+; RV32V-NEXT:    sw s4, 188(sp)
 ; RV32V-NEXT:    sw zero, 176(sp)
-; RV32V-NEXT:    sw t6, 180(sp)
+; RV32V-NEXT:    sw s3, 180(sp)
 ; RV32V-NEXT:    sw zero, 168(sp)
-; RV32V-NEXT:    sw t5, 172(sp)
+; RV32V-NEXT:    sw s2, 172(sp)
 ; RV32V-NEXT:    sw zero, 160(sp)
-; RV32V-NEXT:    sw t4, 164(sp)
-; RV32V-NEXT:    lui s2, 4
+; RV32V-NEXT:    sw s1, 164(sp)
 ; RV32V-NEXT:    sw zero, 152(sp)
-; RV32V-NEXT:    sw t3, 156(sp)
-; RV32V-NEXT:    lui s4, 8
+; RV32V-NEXT:    sw t6, 156(sp)
 ; RV32V-NEXT:    sw zero, 144(sp)
-; RV32V-NEXT:    sw t2, 148(sp)
-; RV32V-NEXT:    lui s3, 16
+; RV32V-NEXT:    sw a0, 148(sp)
 ; RV32V-NEXT:    sw zero, 136(sp)
-; RV32V-NEXT:    sw t1, 140(sp)
-; RV32V-NEXT:    lui t2, 32
+; RV32V-NEXT:    sw a1, 140(sp)
 ; RV32V-NEXT:    sw zero, 128(sp)
-; RV32V-NEXT:    sw t0, 132(sp)
-; RV32V-NEXT:    lui t3, 64
+; RV32V-NEXT:    sw a2, 132(sp)
 ; RV32V-NEXT:    sw zero, 120(sp)
-; RV32V-NEXT:    sw a7, 124(sp)
-; RV32V-NEXT:    lui t4, 128
+; RV32V-NEXT:    sw a3, 124(sp)
 ; RV32V-NEXT:    sw zero, 112(sp)
-; RV32V-NEXT:    sw a6, 116(sp)
-; RV32V-NEXT:    lui t5, 256
+; RV32V-NEXT:    sw t1, 116(sp)
 ; RV32V-NEXT:    sw zero, 104(sp)
-; RV32V-NEXT:    sw a5, 108(sp)
-; RV32V-NEXT:    lui t6, 512
+; RV32V-NEXT:    sw t0, 108(sp)
 ; RV32V-NEXT:    sw zero, 96(sp)
-; RV32V-NEXT:    sw a4, 100(sp)
-; RV32V-NEXT:    lui s0, 1024
+; RV32V-NEXT:    sw t2, 100(sp)
 ; RV32V-NEXT:    sw zero, 88(sp)
-; RV32V-NEXT:    sw a3, 92(sp)
-; RV32V-NEXT:    lui a7, 2048
+; RV32V-NEXT:    sw a4, 92(sp)
 ; RV32V-NEXT:    sw zero, 80(sp)
-; RV32V-NEXT:    sw a2, 84(sp)
-; RV32V-NEXT:    lui s1, 4096
-; RV32V-NEXT:    sw zero, 72(sp)
-; RV32V-NEXT:    sw a1, 76(sp)
-; RV32V-NEXT:    lui t0, 8192
-; RV32V-NEXT:    sw zero, 64(sp)
-; RV32V-NEXT:    sw s6, 68(sp)
-; RV32V-NEXT:    sw zero, 56(sp)
-; RV32V-NEXT:    sw s7, 60(sp)
-; RV32V-NEXT:    sw zero, 48(sp)
-; RV32V-NEXT:    sw s8, 52(sp)
-; RV32V-NEXT:    sw zero, 40(sp)
-; RV32V-NEXT:    sw s9, 44(sp)
-; RV32V-NEXT:    sw zero, 32(sp)
-; RV32V-NEXT:    sw s10, 36(sp)
-; RV32V-NEXT:    sw zero, 24(sp)
-; RV32V-NEXT:    sw a0, 28(sp)
-; RV32V-NEXT:    lui a0, 61681
-; RV32V-NEXT:    addi a0, a0, -241
-; RV32V-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32V-NEXT:    vmv.v.x v2, a0
-; RV32V-NEXT:    lui a0, 209715
-; RV32V-NEXT:    addi a0, a0, 819
-; RV32V-NEXT:    vmv.v.x v1, a0
-; RV32V-NEXT:    lui a0, 349525
-; RV32V-NEXT:    addi a0, a0, 1365
-; RV32V-NEXT:    vmv.v.x v0, a0
-; RV32V-NEXT:    addi a0, sp, 288
-; RV32V-NEXT:    vsetvli a1, zero, e64, m1, ta, mu
-; RV32V-NEXT:    vlse64.v v4, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 280
-; RV32V-NEXT:    vlse64.v v9, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 272
-; RV32V-NEXT:    vlse64.v v10, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 264
-; RV32V-NEXT:    vlse64.v v11, (a0), zero
+; RV32V-NEXT:    sw a6, 84(sp)
+; RV32V-NEXT:    lui a2, 8192
+; RV32V-NEXT:    sw zero, 72(sp)
+; RV32V-NEXT:    sw a7, 76(sp)
+; RV32V-NEXT:    lui t2, 16384
+; RV32V-NEXT:    sw zero, 64(sp)
+; RV32V-NEXT:    sw s0, 68(sp)
+; RV32V-NEXT:    lui a7, 65536
+; RV32V-NEXT:    sw zero, 56(sp)
+; RV32V-NEXT:    sw a7, 60(sp)
+; RV32V-NEXT:    lui a6, 131072
+; RV32V-NEXT:    sw zero, 48(sp)
+; RV32V-NEXT:    sw a6, 52(sp)
+; RV32V-NEXT:    lui a4, 262144
+; RV32V-NEXT:    sw zero, 40(sp)
+; RV32V-NEXT:    sw a4, 44(sp)
+; RV32V-NEXT:    sw a5, 36(sp)
+; RV32V-NEXT:    sw zero, 32(sp)
+; RV32V-NEXT:    lui a3, 4080
+; RV32V-NEXT:    addi t0, sp, 264
+; RV32V-NEXT:    vlse64.v v16, (t0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vsrl.vi v8, v24, 24
+; RV32V-NEXT:    vand.vx v8, v8, a3
+; RV32V-NEXT:    vsrl.vi v0, v24, 8
+; RV32V-NEXT:    vand.vv v0, v0, v16
+; RV32V-NEXT:    vor.vv v8, v0, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vsrl.vi v0, v24, 24
+; RV32V-NEXT:    vand.vx v0, v0, a3
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vsrl.vi v24, v24, 8
+; RV32V-NEXT:    vand.vv v24, v24, v16
+; RV32V-NEXT:    vor.vv v24, v24, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vor.vv v8, v8, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vx v0, v16, a3
+; RV32V-NEXT:    vsll.vi v0, v0, 24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v16, v16, v8
+; RV32V-NEXT:    vsll.vi v16, v16, 8
+; RV32V-NEXT:    vor.vv v16, v0, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vor.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vx v0, v8, a3
+; RV32V-NEXT:    vsll.vi v0, v0, 24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v24, v8
+; RV32V-NEXT:    vsll.vi v8, v8, 8
+; RV32V-NEXT:    vor.vv v8, v0, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vor.vv v16, v0, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vor.vv v8, v0, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vor.vv v0, v16, v24
+; RV32V-NEXT:    lui t0, 61681
+; RV32V-NEXT:    addi t0, t0, -241
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vor.vv v16, v8, v16
+; RV32V-NEXT:    vsetvli t1, zero, e32, m8, ta, ma
+; RV32V-NEXT:    vmv.v.x v8, t0
+; RV32V-NEXT:    vsetvli t0, zero, e64, m8, ta, ma
+; RV32V-NEXT:    vsrl.vi v24, v0, 4
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v24, v0, v8
+; RV32V-NEXT:    vmv8r.v v0, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v0
+; RV32V-NEXT:    vsll.vi v24, v24, 4
+; RV32V-NEXT:    vor.vv v8, v8, v24
+; RV32V-NEXT:    vsrl.vi v24, v16, 4
+; RV32V-NEXT:    vand.vv v16, v16, v0
+; RV32V-NEXT:    vand.vv v24, v24, v0
+; RV32V-NEXT:    vsll.vi v16, v16, 4
+; RV32V-NEXT:    vor.vv v16, v24, v16
+; RV32V-NEXT:    lui t0, 209715
+; RV32V-NEXT:    addi t0, t0, 819
+; RV32V-NEXT:    vsetvli t1, zero, e32, m8, ta, ma
+; RV32V-NEXT:    vmv.v.x v0, t0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vsetvli t0, zero, e64, m8, ta, ma
+; RV32V-NEXT:    vsrl.vi v24, v8, 2
+; RV32V-NEXT:    vand.vv v8, v8, v0
+; RV32V-NEXT:    vand.vv v24, v24, v0
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v24, v8
+; RV32V-NEXT:    vsrl.vi v24, v16, 2
+; RV32V-NEXT:    vand.vv v16, v16, v0
+; RV32V-NEXT:    vand.vv v24, v24, v0
+; RV32V-NEXT:    vsll.vi v16, v16, 2
+; RV32V-NEXT:    vor.vv v24, v24, v16
+; RV32V-NEXT:    lui t0, 349525
+; RV32V-NEXT:    addi t0, t0, 1365
+; RV32V-NEXT:    vsetvli t1, zero, e32, m8, ta, ma
+; RV32V-NEXT:    vmv.v.x v0, t0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vsetvli t0, zero, e64, m8, ta, ma
+; RV32V-NEXT:    vsrl.vi v16, v8, 1
+; RV32V-NEXT:    vand.vv v8, v8, v0
+; RV32V-NEXT:    vand.vv v16, v16, v0
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v16, v16, v8
+; RV32V-NEXT:    vsrl.vi v8, v24, 1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v8, v24, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi t0, sp, 24
+; RV32V-NEXT:    vlse64.v v0, (t0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vadd.vv v24, v24, v24
+; RV32V-NEXT:    vor.vv v8, v8, v24
+; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi t0, sp, 288
+; RV32V-NEXT:    addi t1, sp, 272
+; RV32V-NEXT:    addi a1, sp, 280
 ; RV32V-NEXT:    addi a0, sp, 256
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 248
-; RV32V-NEXT:    vlse64.v v14, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 240
-; RV32V-NEXT:    vlse64.v v18, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 232
-; RV32V-NEXT:    vlse64.v v19, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 224
-; RV32V-NEXT:    vlse64.v v20, (a0), zero
+; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    csrr a5, vlenb
+; RV32V-NEXT:    slli a5, a5, 4
+; RV32V-NEXT:    mv t0, a5
+; RV32V-NEXT:    slli a5, a5, 1
+; RV32V-NEXT:    add t0, t0, a5
+; RV32V-NEXT:    slli a5, a5, 1
+; RV32V-NEXT:    add t0, t0, a5
+; RV32V-NEXT:    slli a5, a5, 3
+; RV32V-NEXT:    add a5, a5, t0
+; RV32V-NEXT:    add a5, sp, a5
+; RV32V-NEXT:    addi a5, a5, 304
+; RV32V-NEXT:    vs8r.v v24, (a5) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v0, (t1), zero
+; RV32V-NEXT:    vlse64.v v24, (a1), zero
+; RV32V-NEXT:    csrr a1, vlenb
+; RV32V-NEXT:    slli a1, a1, 5
+; RV32V-NEXT:    mv a5, a1
+; RV32V-NEXT:    slli a1, a1, 1
+; RV32V-NEXT:    add a5, a5, a1
+; RV32V-NEXT:    slli a1, a1, 3
+; RV32V-NEXT:    add a1, a1, a5
+; RV32V-NEXT:    add a1, sp, a1
+; RV32V-NEXT:    addi a1, a1, 304
+; RV32V-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 248
+; RV32V-NEXT:    addi a1, sp, 240
+; RV32V-NEXT:    addi t0, sp, 232
+; RV32V-NEXT:    addi t1, sp, 224
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a5, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a5, a5, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v0, (a1), zero
+; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v24, (t1), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    addi a0, sp, 216
-; RV32V-NEXT:    vlse64.v v21, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 208
-; RV32V-NEXT:    vlse64.v v22, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 200
-; RV32V-NEXT:    vlse64.v v23, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 192
+; RV32V-NEXT:    addi a1, sp, 208
+; RV32V-NEXT:    addi t0, sp, 200
+; RV32V-NEXT:    addi t1, sp, 192
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a5, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a5, a5, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v0, (a1), zero
+; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v24, (t1), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 184
+; RV32V-NEXT:    addi a1, sp, 176
+; RV32V-NEXT:    addi t0, sp, 168
+; RV32V-NEXT:    addi t1, sp, 160
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a5, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v0, (a1), zero
+; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 9
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v24, (t1), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 9
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 9
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 152
+; RV32V-NEXT:    addi a1, sp, 144
+; RV32V-NEXT:    addi t0, sp, 136
+; RV32V-NEXT:    addi t1, sp, 128
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a5, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a5, a5, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a5, a5, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a5, a5, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v0, (a1), zero
+; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v24, (t1), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 120
+; RV32V-NEXT:    addi a1, sp, 112
+; RV32V-NEXT:    addi t0, sp, 104
+; RV32V-NEXT:    addi t1, sp, 96
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a5, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a5, a5, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a5, a5, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v0, (a1), zero
+; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v24, (t1), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 88
+; RV32V-NEXT:    addi a1, sp, 80
+; RV32V-NEXT:    addi t0, sp, 72
+; RV32V-NEXT:    addi t1, sp, 64
 ; RV32V-NEXT:    vlse64.v v24, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 184
-; RV32V-NEXT:    vlse64.v v25, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 176
-; RV32V-NEXT:    vlse64.v v26, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 168
-; RV32V-NEXT:    vlse64.v v27, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 160
-; RV32V-NEXT:    vlse64.v v28, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 152
-; RV32V-NEXT:    vlse64.v v29, (a0), zero
-; RV32V-NEXT:    addi a0, sp, 144
-; RV32V-NEXT:    vlse64.v v30, (a0), zero
-; RV32V-NEXT:    li a6, 56
-; RV32V-NEXT:    vmv1r.v v31, v8
-; RV32V-NEXT:    vsrl.vi v8, v8, 24
-; RV32V-NEXT:    vsrl.vi v15, v31, 8
-; RV32V-NEXT:    vsrl.vx v16, v31, a6
-; RV32V-NEXT:    li ra, 40
-; RV32V-NEXT:    vsrl.vx v17, v31, ra
-; RV32V-NEXT:    lui a5, 4080
-; RV32V-NEXT:    vand.vx v8, v8, a5
-; RV32V-NEXT:    vsll.vx v7, v31, a6
-; RV32V-NEXT:    addi a4, s3, -256
-; RV32V-NEXT:    vand.vx v17, v17, a4
-; RV32V-NEXT:    vand.vx v6, v31, a4
-; RV32V-NEXT:    vor.vv v16, v17, v16
-; RV32V-NEXT:    vsll.vx v17, v6, ra
-; RV32V-NEXT:    vor.vv v17, v7, v17
-; RV32V-NEXT:    vsrl.vx v7, v13, a6
-; RV32V-NEXT:    vsrl.vx v6, v13, ra
-; RV32V-NEXT:    vsll.vx v5, v13, a6
-; RV32V-NEXT:    vand.vx v6, v6, a4
-; RV32V-NEXT:    vor.vv v7, v6, v7
-; RV32V-NEXT:    vand.vx v6, v13, a4
-; RV32V-NEXT:    vsll.vx v6, v6, ra
-; RV32V-NEXT:    vor.vv v6, v5, v6
-; RV32V-NEXT:    vsrl.vi v5, v13, 24
-; RV32V-NEXT:    vand.vv v15, v15, v4
-; RV32V-NEXT:    vor.vv v8, v15, v8
-; RV32V-NEXT:    vsrl.vi v15, v13, 8
-; RV32V-NEXT:    vand.vx v5, v5, a5
-; RV32V-NEXT:    vand.vv v15, v15, v4
-; RV32V-NEXT:    vor.vv v5, v15, v5
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a5, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a5, a5, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a5, a5, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v0, (a1), zero
+; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v24, (t1), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 56
+; RV32V-NEXT:    addi a1, sp, 48
+; RV32V-NEXT:    addi t0, sp, 40
+; RV32V-NEXT:    addi t1, sp, 32
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a5, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a5, a5, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v0, (a1), zero
+; RV32V-NEXT:    vlse64.v v24, (t0), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vlse64.v v24, (t1), zero
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vv v24, v8, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v24, v8, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s11
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s10
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s9
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s7
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s6
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s5
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, ra
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s4
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s3
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s2
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s1
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, t6
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a0, 32
+; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a0, 64
+; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 8
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a0, 128
+; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a0, 256
+; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a0, 512
+; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a0, 1024
+; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a0, 2048
+; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    lui a0, 4096
+; RV32V-NEXT:    vand.vx v24, v8, a0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, a2
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, t2
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, s0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, a7
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, a6
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vx v24, v8, a4
+; RV32V-NEXT:    addi a0, sp, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v24, v8, 2
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v0, v8, 1
+; RV32V-NEXT:    vand.vi v24, v8, 4
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vand.vi v8, v8, 8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v24, v16, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vmul.vv v24, v16, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v0, v16, v0
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 8
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vx v15, v31, a5
-; RV32V-NEXT:    vsll.vi v15, v15, 24
-; RV32V-NEXT:    vor.vv v8, v8, v16
-; RV32V-NEXT:    vand.vv v16, v31, v4
-; RV32V-NEXT:    vmv.v.v v31, v4
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vsll.vi v16, v16, 8
-; RV32V-NEXT:    vor.vv v4, v15, v16
-; RV32V-NEXT:    addi a3, sp, 136
-; RV32V-NEXT:    vlse64.v v15, (a3), zero
-; RV32V-NEXT:    vor.vv v7, v5, v7
-; RV32V-NEXT:    vand.vx v16, v13, a5
-; RV32V-NEXT:    vsll.vi v16, v16, 24
-; RV32V-NEXT:    vand.vv v13, v13, v31
-; RV32V-NEXT:    vsll.vi v13, v13, 8
-; RV32V-NEXT:    vor.vv v13, v16, v13
-; RV32V-NEXT:    addi a3, sp, 128
-; RV32V-NEXT:    vlse64.v v16, (a3), zero
-; RV32V-NEXT:    vor.vv v5, v17, v4
-; RV32V-NEXT:    addi a3, sp, 120
-; RV32V-NEXT:    vlse64.v v17, (a3), zero
-; RV32V-NEXT:    vor.vv v13, v6, v13
-; RV32V-NEXT:    addi a3, sp, 112
-; RV32V-NEXT:    vlse64.v v4, (a3), zero
-; RV32V-NEXT:    vor.vv v8, v5, v8
-; RV32V-NEXT:    addi a3, sp, 104
-; RV32V-NEXT:    vlse64.v v3, (a3), zero
-; RV32V-NEXT:    vor.vv v13, v13, v7
-; RV32V-NEXT:    vsrl.vi v7, v8, 4
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    addi a0, sp, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v8, v2
-; RV32V-NEXT:    vand.vv v7, v7, v2
-; RV32V-NEXT:    vsll.vi v8, v8, 4
-; RV32V-NEXT:    vor.vv v8, v7, v8
-; RV32V-NEXT:    vsrl.vi v7, v13, 4
-; RV32V-NEXT:    vand.vv v13, v13, v2
-; RV32V-NEXT:    vand.vv v7, v7, v2
-; RV32V-NEXT:    vsll.vi v13, v13, 4
-; RV32V-NEXT:    vor.vv v13, v7, v13
-; RV32V-NEXT:    vsrl.vi v7, v8, 2
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v8, v1
-; RV32V-NEXT:    vand.vv v7, v7, v1
-; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v7, v8
-; RV32V-NEXT:    vsrl.vi v7, v13, 2
-; RV32V-NEXT:    vand.vv v13, v13, v1
-; RV32V-NEXT:    vand.vv v7, v7, v1
-; RV32V-NEXT:    vsll.vi v13, v13, 2
-; RV32V-NEXT:    vor.vv v7, v7, v13
-; RV32V-NEXT:    vsrl.vi v13, v8, 1
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v8, v8, v0
-; RV32V-NEXT:    vand.vv v13, v13, v0
-; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v13, v13, v8
-; RV32V-NEXT:    vsrl.vi v8, v7, 1
-; RV32V-NEXT:    vand.vv v7, v7, v0
-; RV32V-NEXT:    vand.vv v8, v8, v0
-; RV32V-NEXT:    vadd.vv v7, v7, v7
-; RV32V-NEXT:    vor.vv v8, v8, v7
-; RV32V-NEXT:    addi a3, sp, 96
-; RV32V-NEXT:    vlse64.v v2, (a3), zero
-; RV32V-NEXT:    vand.vv v9, v8, v9
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 8
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v10
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v11
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 9
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 9
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v12
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    sub a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v14
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
@@ -54283,579 +40501,418 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vx_mask(<vscale x 1 x i64> %va, i64 %b
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v18
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v19
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v20
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v21
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v22
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v23
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v24
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v25
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v26
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v27
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v28
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v29
-; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v30
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v15
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v16
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v17
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi a3, sp, 88
-; RV32V-NEXT:    vlse64.v v9, (a3), zero
-; RV32V-NEXT:    vand.vv v10, v8, v4
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v3
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v10, v8, v2
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v9
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi a3, sp, 80
-; RV32V-NEXT:    addi a2, sp, 72
-; RV32V-NEXT:    addi a1, sp, 64
-; RV32V-NEXT:    addi a0, sp, 56
-; RV32V-NEXT:    vlse64.v v9, (a3), zero
-; RV32V-NEXT:    vlse64.v v10, (a2), zero
-; RV32V-NEXT:    vlse64.v v11, (a1), zero
-; RV32V-NEXT:    vlse64.v v12, (a0), zero
-; RV32V-NEXT:    vand.vv v9, v8, v9
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v10
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v11
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    addi a0, sp, 48
-; RV32V-NEXT:    addi a1, sp, 40
-; RV32V-NEXT:    addi a2, sp, 32
-; RV32V-NEXT:    addi a3, sp, 24
-; RV32V-NEXT:    vlse64.v v9, (a0), zero
-; RV32V-NEXT:    vlse64.v v10, (a1), zero
-; RV32V-NEXT:    vlse64.v v11, (a2), zero
-; RV32V-NEXT:    vlse64.v v12, (a3), zero
-; RV32V-NEXT:    vand.vv v9, v8, v9
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 2
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v10
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v11
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 1
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vand.vv v9, v8, v12
-; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    li a0, 16
-; RV32V-NEXT:    vand.vx v2, v8, a0
-; RV32V-NEXT:    li a0, 32
-; RV32V-NEXT:    vand.vx v1, v8, a0
-; RV32V-NEXT:    li a0, 64
-; RV32V-NEXT:    vand.vx v0, v8, a0
-; RV32V-NEXT:    li a0, 128
-; RV32V-NEXT:    vand.vx v12, v8, a0
-; RV32V-NEXT:    li a0, 256
-; RV32V-NEXT:    vand.vx v14, v8, a0
-; RV32V-NEXT:    li a0, 512
-; RV32V-NEXT:    vand.vx v15, v8, a0
-; RV32V-NEXT:    li a0, 1024
-; RV32V-NEXT:    vand.vx v16, v8, a0
-; RV32V-NEXT:    vand.vx v17, v8, s11
-; RV32V-NEXT:    lui a0, 1
-; RV32V-NEXT:    vand.vx v18, v8, a0
-; RV32V-NEXT:    lui a0, 2
-; RV32V-NEXT:    vand.vx v19, v8, a0
-; RV32V-NEXT:    vand.vx v20, v8, s2
-; RV32V-NEXT:    vand.vx v21, v8, s4
-; RV32V-NEXT:    vand.vx v22, v8, s3
-; RV32V-NEXT:    vand.vx v23, v8, t2
-; RV32V-NEXT:    vand.vx v24, v8, t3
-; RV32V-NEXT:    vand.vx v25, v8, t4
-; RV32V-NEXT:    vand.vx v26, v8, t5
-; RV32V-NEXT:    vand.vx v27, v8, t6
-; RV32V-NEXT:    vand.vx v28, v8, s0
-; RV32V-NEXT:    vand.vx v29, v8, a7
-; RV32V-NEXT:    vand.vx v30, v8, s1
-; RV32V-NEXT:    vand.vx v31, v8, t0
-; RV32V-NEXT:    vand.vx v7, v8, s6
-; RV32V-NEXT:    vand.vx v6, v8, s7
-; RV32V-NEXT:    vand.vx v5, v8, s8
-; RV32V-NEXT:    vand.vx v4, v8, s9
-; RV32V-NEXT:    vand.vx v3, v8, s10
-; RV32V-NEXT:    vand.vi v9, v8, 2
-; RV32V-NEXT:    vand.vi v10, v8, 1
-; RV32V-NEXT:    vand.vi v11, v8, 4
-; RV32V-NEXT:    vand.vi v8, v8, 8
-; RV32V-NEXT:    vmul.vv v9, v13, v9
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v10, v13, v10
-; RV32V-NEXT:    vmul.vv v11, v13, v11
-; RV32V-NEXT:    vmul.vv v8, v13, v8
-; RV32V-NEXT:    addi a0, sp, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    vmul.vv v2, v13, v2
-; RV32V-NEXT:    vmul.vv v1, v13, v1
-; RV32V-NEXT:    vmul.vv v0, v13, v0
-; RV32V-NEXT:    vmul.vv v12, v13, v12
-; RV32V-NEXT:    vmul.vv v14, v13, v14
-; RV32V-NEXT:    vmul.vv v15, v13, v15
-; RV32V-NEXT:    vmul.vv v16, v13, v16
-; RV32V-NEXT:    vmul.vv v17, v13, v17
-; RV32V-NEXT:    vmul.vv v18, v13, v18
-; RV32V-NEXT:    vmul.vv v19, v13, v19
-; RV32V-NEXT:    vmul.vv v20, v13, v20
-; RV32V-NEXT:    vmul.vv v21, v13, v21
-; RV32V-NEXT:    vmul.vv v22, v13, v22
-; RV32V-NEXT:    vmul.vv v23, v13, v23
-; RV32V-NEXT:    vmul.vv v24, v13, v24
-; RV32V-NEXT:    vmul.vv v25, v13, v25
-; RV32V-NEXT:    vmul.vv v26, v13, v26
-; RV32V-NEXT:    vmul.vv v27, v13, v27
-; RV32V-NEXT:    vmul.vv v28, v13, v28
-; RV32V-NEXT:    vmul.vv v29, v13, v29
-; RV32V-NEXT:    vmul.vv v30, v13, v30
-; RV32V-NEXT:    vmul.vv v31, v13, v31
-; RV32V-NEXT:    vmul.vv v7, v13, v7
-; RV32V-NEXT:    vmul.vv v6, v13, v6
-; RV32V-NEXT:    vmul.vv v5, v13, v5
-; RV32V-NEXT:    vmul.vv v4, v13, v4
-; RV32V-NEXT:    vmul.vv v3, v13, v3
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v9, v13, v9
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vmul.vv v8, v16, v8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -54865,122 +40922,155 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vx_mask(<vscale x 1 x i64> %va, i64 %b
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    sub a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
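
The address arithmetic repeated throughout these RV32V hunks follows a single pattern: csrr reads vlenb, the slli/mv/add chain scales it by a constant, and the result is added to sp plus the fixed 304-byte frame offset. A minimal sketch of what each chain computes, assuming that frame layout (the slot scale k is a hypothetical name, not something in the patch):

    #include <cstdint>

    // Hedged sketch: every folded vector spill/reload above targets
    // sp + 304 + k * vlenb. The shift-and-add sequences simply build
    // k * vlenb without using a multiply instruction.
    char *scalable_spill_slot(char *sp, uint64_t vlenb, uint64_t k) {
      return sp + k * vlenb + 304;
    }
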
@@ -54988,53 +41078,78 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vx_mask(<vscale x 1 x i64> %va, i64 %b
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    sub a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vsll.vx v8, v8, t5
+; RV32V-NEXT:    vand.vx v16, v16, t3
+; RV32V-NEXT:    vsll.vx v16, v16, t4
+; RV32V-NEXT:    vor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v24, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
@@ -55042,564 +41157,3924 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vx_mask(<vscale x 1 x i64> %va, i64 %b
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
-; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    sub a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 2
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 2
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 1
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v8, v13, v8
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v24, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 8
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vmul.vv v13, v13, v8
-; RV32V-NEXT:    vxor.vi v10, v10, 0
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v10, v10, v8
-; RV32V-NEXT:    vxor.vv v10, v10, v11
-; RV32V-NEXT:    addi a0, sp, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v10, v8
-; RV32V-NEXT:    vxor.vv v8, v8, v2
-; RV32V-NEXT:    vxor.vv v8, v8, v1
-; RV32V-NEXT:    vxor.vv v8, v8, v0
-; RV32V-NEXT:    vxor.vv v8, v8, v12
-; RV32V-NEXT:    vxor.vv v8, v8, v14
-; RV32V-NEXT:    vxor.vv v8, v8, v15
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
 ; RV32V-NEXT:    vxor.vv v8, v8, v16
-; RV32V-NEXT:    vxor.vv v8, v8, v17
-; RV32V-NEXT:    vxor.vv v8, v8, v18
-; RV32V-NEXT:    vxor.vv v8, v8, v19
-; RV32V-NEXT:    vxor.vv v8, v8, v20
-; RV32V-NEXT:    vxor.vv v8, v8, v21
-; RV32V-NEXT:    vxor.vv v8, v8, v22
-; RV32V-NEXT:    vxor.vv v8, v8, v23
-; RV32V-NEXT:    vxor.vv v8, v8, v24
-; RV32V-NEXT:    vxor.vv v8, v8, v25
-; RV32V-NEXT:    vxor.vv v8, v8, v26
-; RV32V-NEXT:    vxor.vv v8, v8, v27
-; RV32V-NEXT:    vxor.vv v8, v8, v28
-; RV32V-NEXT:    vxor.vv v8, v8, v29
-; RV32V-NEXT:    vxor.vv v8, v8, v30
-; RV32V-NEXT:    vxor.vv v8, v8, v31
-; RV32V-NEXT:    vxor.vv v8, v8, v7
-; RV32V-NEXT:    vxor.vv v8, v8, v6
-; RV32V-NEXT:    vxor.vv v8, v8, v5
-; RV32V-NEXT:    vxor.vv v8, v8, v4
-; RV32V-NEXT:    vxor.vv v8, v8, v3
-; RV32V-NEXT:    vxor.vv v8, v8, v9
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    slli a0, a0, 9
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 5
-; RV32V-NEXT:    sub a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v0, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v0, v0, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v0, v16
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32V-NEXT:    vsrl.vi v0, v24, 8
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v0, v0, v16
+; RV32V-NEXT:    vsrl.vi v8, v8, 24
+; RV32V-NEXT:    vand.vx v8, v8, a3
+; RV32V-NEXT:    vor.vv v8, v0, v8
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 6
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v16, v8
+; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 6
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
+; RV32V-NEXT:    add a0, sp, a0
+; RV32V-NEXT:    addi a0, a0, 304
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v8, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v8, v16
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v16, v16, v0
+; RV32V-NEXT:    vand.vx v24, v24, a3
+; RV32V-NEXT:    vsll.vi v24, v24, 24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v0, v8, v0
+; RV32V-NEXT:    vsll.vi v0, v0, 8
+; RV32V-NEXT:    vor.vv v24, v24, v0
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v0, v16, v0
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 4
-; RV32V-NEXT:    sub a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vor.vv v16, v16, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v0, v24
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v0
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 3
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vxor.vv v24, v24, v0
+; RV32V-NEXT:    vsrl.vx v8, v8, t4
+; RV32V-NEXT:    vand.vx v8, v8, t3
+; RV32V-NEXT:    vsrl.vx v24, v24, t5
+; RV32V-NEXT:    vor.vv v8, v8, v24
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    add a0, a1, a0
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    mv a1, a0
+; RV32V-NEXT:    slli a0, a0, 5
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vor.vv v8, v24, v8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 4
 ; RV32V-NEXT:    csrr a0, vlenb
 ; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 3
-; RV32V-NEXT:    sub a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a1, a0, 2
-; RV32V-NEXT:    add a0, a1, a0
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v24
+; RV32V-NEXT:    vand.vv v16, v16, v24
+; RV32V-NEXT:    vsll.vi v8, v8, 4
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 2
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 7
+; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
+; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vxor.vv v8, v8, v9
-; RV32V-NEXT:    vxor.vv v8, v8, v13
-; RV32V-NEXT:    vsrl.vx v9, v8, a6
-; RV32V-NEXT:    vsll.vx v10, v8, a6
-; RV32V-NEXT:    vsrl.vx v11, v8, ra
-; RV32V-NEXT:    vand.vx v12, v8, a4
-; RV32V-NEXT:    vand.vx v11, v11, a4
-; RV32V-NEXT:    vsrl.vi v13, v8, 24
-; RV32V-NEXT:    vand.vx v14, v8, a5
-; RV32V-NEXT:    vand.vx v13, v13, a5
-; RV32V-NEXT:    vsll.vx v12, v12, ra
-; RV32V-NEXT:    vsrl.vi v15, v8, 8
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v24
+; RV32V-NEXT:    vand.vv v16, v16, v24
+; RV32V-NEXT:    vsll.vi v8, v8, 2
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v16, v8, 1
 ; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 1
+; RV32V-NEXT:    slli a0, a0, 4
 ; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
 ; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    add a1, a1, a0
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
 ; RV32V-NEXT:    add a0, sp, a0
 ; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v16
-; RV32V-NEXT:    vand.vv v15, v15, v16
-; RV32V-NEXT:    vor.vv v9, v11, v9
-; RV32V-NEXT:    vor.vv v11, v15, v13
-; RV32V-NEXT:    vsll.vi v8, v8, 8
-; RV32V-NEXT:    vsll.vi v13, v14, 24
-; RV32V-NEXT:    vor.vv v8, v13, v8
-; RV32V-NEXT:    vor.vv v10, v10, v12
-; RV32V-NEXT:    vor.vv v9, v11, v9
-; RV32V-NEXT:    vor.vv v8, v10, v8
-; RV32V-NEXT:    vor.vv v8, v8, v9
-; RV32V-NEXT:    vsrl.vi v9, v8, 4
+; RV32V-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32V-NEXT:    vand.vv v8, v8, v24
+; RV32V-NEXT:    vand.vv v16, v16, v24
+; RV32V-NEXT:    vadd.vv v8, v8, v8
+; RV32V-NEXT:    vor.vv v8, v16, v8
+; RV32V-NEXT:    vsrl.vi v8, v8, 1
 ; RV32V-NEXT:    csrr a0, vlenb
+; RV32V-NEXT:    slli a0, a0, 5
 ; RV32V-NEXT:    mv a1, a0
 ; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
+; RV32V-NEXT:    slli a0, a0, 2
 ; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v10
+; RV32V-NEXT:    add sp, sp, a0
+; RV32V-NEXT:    lw ra, 364(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s0, 360(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s1, 356(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s2, 352(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s3, 348(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s4, 344(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s5, 340(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s6, 336(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s7, 332(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s8, 328(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s9, 324(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s10, 320(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s11, 316(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    addi sp, sp, 368
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv8i64_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    addi sp, sp, -16
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 5
+; RV64V-NEXT:    sub sp, sp, a1
+; RV64V-NEXT:    li a1, 56
+; RV64V-NEXT:    lui t2, 16
+; RV64V-NEXT:    lui a2, 4080
+; RV64V-NEXT:    li t0, 255
+; RV64V-NEXT:    lui a3, 61681
+; RV64V-NEXT:    lui a4, 209715
+; RV64V-NEXT:    lui a5, 349525
+; RV64V-NEXT:    srli a6, a0, 24
+; RV64V-NEXT:    srli a7, a0, 8
+; RV64V-NEXT:    srli t1, a0, 40
+; RV64V-NEXT:    srli t3, a0, 56
+; RV64V-NEXT:    addi a3, a3, -241
+; RV64V-NEXT:    addi a4, a4, 819
+; RV64V-NEXT:    addi t4, a5, 1365
+; RV64V-NEXT:    slli a5, a3, 32
+; RV64V-NEXT:    add a5, a3, a5
+; RV64V-NEXT:    slli a3, a4, 32
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, t4, 32
+; RV64V-NEXT:    add a3, t4, a3
+; RV64V-NEXT:    srliw t4, a0, 24
+; RV64V-NEXT:    slli t0, t0, 24
+; RV64V-NEXT:    and a6, a6, a2
+; RV64V-NEXT:    and a7, a7, t0
+; RV64V-NEXT:    or t5, a7, a6
+; RV64V-NEXT:    addi a6, t2, -256
+; RV64V-NEXT:    and a7, t1, a6
+; RV64V-NEXT:    or t1, a7, t3
+; RV64V-NEXT:    and a7, a0, a2
+; RV64V-NEXT:    slli t4, t4, 32
+; RV64V-NEXT:    slli a7, a7, 24
+; RV64V-NEXT:    or t3, a7, t4
+; RV64V-NEXT:    li a7, 40
+; RV64V-NEXT:    vsetvli t4, zero, e64, m8, ta, ma
+; RV64V-NEXT:    vsrl.vi v24, v8, 24
+; RV64V-NEXT:    vsrl.vx v16, v8, a1
+; RV64V-NEXT:    vsrl.vx v0, v8, a7
+; RV64V-NEXT:    vand.vx v0, v0, a6
+; RV64V-NEXT:    vor.vv v16, v0, v16
+; RV64V-NEXT:    vsrl.vi v0, v8, 8
+; RV64V-NEXT:    or t1, t5, t1
+; RV64V-NEXT:    slli t4, a0, 56
+; RV64V-NEXT:    and a0, a0, a6
+; RV64V-NEXT:    slli a0, a0, 40
+; RV64V-NEXT:    or t4, t4, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    or t4, t4, t3
+; RV64V-NEXT:    lui t3, 1
+; RV64V-NEXT:    vand.vx v24, v24, a2
+; RV64V-NEXT:    vand.vx v0, v0, t0
+; RV64V-NEXT:    vor.vv v24, v0, v24
+; RV64V-NEXT:    vand.vx v0, v8, a2
+; RV64V-NEXT:    vsll.vi v0, v0, 24
+; RV64V-NEXT:    vor.vv v16, v24, v16
+; RV64V-NEXT:    vand.vx v24, v8, t0
+; RV64V-NEXT:    vsll.vi v24, v24, 8
+; RV64V-NEXT:    vor.vv v24, v0, v24
+; RV64V-NEXT:    vsll.vx v0, v8, a1
+; RV64V-NEXT:    vand.vx v8, v8, a6
+; RV64V-NEXT:    vsll.vx v8, v8, a7
+; RV64V-NEXT:    vor.vv v8, v0, v8
+; RV64V-NEXT:    vor.vv v8, v8, v24
+; RV64V-NEXT:    vor.vv v8, v8, v16
+; RV64V-NEXT:    or t1, t4, t1
+; RV64V-NEXT:    vsrl.vi v16, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, a5
+; RV64V-NEXT:    srli t4, t1, 4
+; RV64V-NEXT:    and t1, t1, a5
+; RV64V-NEXT:    vand.vx v16, v16, a5
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    and t4, t4, a5
+; RV64V-NEXT:    slli t1, t1, 4
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    or t1, t4, t1
+; RV64V-NEXT:    vsrl.vi v16, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, a4
+; RV64V-NEXT:    srli t4, t1, 2
+; RV64V-NEXT:    and t1, t1, a4
+; RV64V-NEXT:    vand.vx v16, v16, a4
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    and t4, t4, a4
+; RV64V-NEXT:    slli t1, t1, 2
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    or t1, t4, t1
+; RV64V-NEXT:    vsrl.vi v16, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, a3
+; RV64V-NEXT:    srli t4, t1, 1
+; RV64V-NEXT:    and t1, t1, a3
+; RV64V-NEXT:    vand.vx v16, v16, a3
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    and t4, t4, a3
+; RV64V-NEXT:    slli t1, t1, 1
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    or t1, t4, t1
+; RV64V-NEXT:    andi t4, t1, 2
+; RV64V-NEXT:    vmul.vx v16, v8, t4
+; RV64V-NEXT:    andi t4, t1, 1
+; RV64V-NEXT:    vmul.vx v24, v8, t4
+; RV64V-NEXT:    andi t4, t1, 4
+; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    andi t4, t1, 8
+; RV64V-NEXT:    vxor.vv v16, v24, v16
+; RV64V-NEXT:    vmul.vx v24, v8, t4
+; RV64V-NEXT:    andi t4, t1, 16
+; RV64V-NEXT:    vxor.vv v16, v16, v0
+; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    andi t4, t1, 32
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vmul.vx v24, v8, t4
+; RV64V-NEXT:    andi t4, t1, 64
+; RV64V-NEXT:    vxor.vv v16, v16, v0
+; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    andi t4, t1, 128
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vmul.vx v24, v8, t4
+; RV64V-NEXT:    andi t4, t1, 256
+; RV64V-NEXT:    vxor.vv v16, v16, v0
+; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    andi t4, t1, 512
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    csrr t5, vlenb
+; RV64V-NEXT:    slli t5, t5, 3
+; RV64V-NEXT:    mv t6, t5
+; RV64V-NEXT:    slli t5, t5, 1
+; RV64V-NEXT:    add t5, t5, t6
+; RV64V-NEXT:    add t5, sp, t5
+; RV64V-NEXT:    addi t5, t5, 16
+; RV64V-NEXT:    vs8r.v v16, (t5) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vmul.vx v24, v8, t4
+; RV64V-NEXT:    andi t4, t1, 1024
+; RV64V-NEXT:    vxor.vv v0, v16, v0
+; RV64V-NEXT:    vxor.vv v24, v0, v24
+; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    slli t4, a0, 11
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    lui t4, 2
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vx v0, v8, t3
+; RV64V-NEXT:    lui t3, 4
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    lui t4, 8
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vx v0, v8, t3
+; RV64V-NEXT:    lui t3, 32
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    lui t4, 64
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vx v0, v8, t2
+; RV64V-NEXT:    lui t2, 128
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vx v0, v8, t3
+; RV64V-NEXT:    lui t3, 256
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    lui t4, 512
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vx v0, v8, t2
+; RV64V-NEXT:    lui t2, 1024
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vx v0, v8, t3
+; RV64V-NEXT:    lui t3, 2048
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vx v0, v8, t4
+; RV64V-NEXT:    lui t4, 4096
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vx v0, v8, t2
+; RV64V-NEXT:    lui t2, 8192
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    vmul.vx v0, v8, t3
+; RV64V-NEXT:    vxor.vv v0, v24, v0
+; RV64V-NEXT:    vmul.vx v24, v8, t4
+; RV64V-NEXT:    vxor.vv v24, v0, v24
+; RV64V-NEXT:    vmul.vx v16, v8, t2
+; RV64V-NEXT:    vxor.vv v24, v24, v16
+; RV64V-NEXT:    lui t2, 16384
+; RV64V-NEXT:    lui t3, 32768
+; RV64V-NEXT:    lui t4, 65536
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    csrr t5, vlenb
+; RV64V-NEXT:    slli t5, t5, 3
+; RV64V-NEXT:    mv t6, t5
+; RV64V-NEXT:    slli t5, t5, 1
+; RV64V-NEXT:    add t5, t5, t6
+; RV64V-NEXT:    add t5, sp, t5
+; RV64V-NEXT:    addi t5, t5, 16
+; RV64V-NEXT:    vl8r.v v16, (t5) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vsll.vx v16, v16, a1
+; RV64V-NEXT:    vand.vx v0, v0, a6
+; RV64V-NEXT:    vsll.vx v0, v0, a7
+; RV64V-NEXT:    vor.vv v16, v16, v0
+; RV64V-NEXT:    csrr t5, vlenb
+; RV64V-NEXT:    slli t5, t5, 3
+; RV64V-NEXT:    add t5, sp, t5
+; RV64V-NEXT:    addi t5, t5, 16
+; RV64V-NEXT:    vs8r.v v16, (t5) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vmul.vx v16, v8, t2
+; RV64V-NEXT:    lui t2, 131072
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v16, v24, v16
+; RV64V-NEXT:    vmul.vx v24, v8, t3
+; RV64V-NEXT:    lui t3, 262144
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vmul.vx v24, v8, t4
+; RV64V-NEXT:    slli t4, a0, 32
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vmul.vx v24, v8, t2
+; RV64V-NEXT:    slli t2, a0, 33
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vmul.vx v24, v8, t3
+; RV64V-NEXT:    srliw t3, t1, 31
+; RV64V-NEXT:    slli t3, t3, 31
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vmul.vx v24, v8, t3
+; RV64V-NEXT:    slli t3, a0, 34
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vmul.vx v24, v8, t4
+; RV64V-NEXT:    slli t4, a0, 35
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vmul.vx v24, v8, t2
+; RV64V-NEXT:    slli t2, a0, 36
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vmul.vx v24, v8, t3
+; RV64V-NEXT:    slli t3, a0, 37
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vmul.vx v24, v8, t4
+; RV64V-NEXT:    slli t4, a0, 38
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vmul.vx v24, v8, t2
+; RV64V-NEXT:    slli t2, a0, 39
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vmul.vx v24, v8, t3
+; RV64V-NEXT:    slli t3, a0, 40
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vmul.vx v24, v8, t4
+; RV64V-NEXT:    slli t4, a0, 41
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    and t4, t1, t4
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    vmul.vx v24, v8, t2
+; RV64V-NEXT:    vxor.vv v24, v16, v24
+; RV64V-NEXT:    csrr t2, vlenb
+; RV64V-NEXT:    slli t2, t2, 3
+; RV64V-NEXT:    mv t5, t2
+; RV64V-NEXT:    slli t2, t2, 1
+; RV64V-NEXT:    add t2, t2, t5
+; RV64V-NEXT:    add t2, sp, t2
+; RV64V-NEXT:    addi t2, t2, 16
+; RV64V-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    vmul.vx v16, v8, t3
+; RV64V-NEXT:    vxor.vv v16, v24, v16
+; RV64V-NEXT:    vmul.vx v24, v8, t4
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    slli t2, a0, 42
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vmul.vx v24, v8, t2
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    slli t2, a0, 43
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vmul.vx v24, v8, t2
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    slli t2, a0, 44
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vmul.vx v24, v8, t2
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    slli t2, a0, 45
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vmul.vx v24, v8, t2
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    slli t2, a0, 46
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vmul.vx v24, v8, t2
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    slli t2, a0, 47
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vmul.vx v24, v8, t2
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    slli t2, a0, 48
+; RV64V-NEXT:    slli t3, a0, 49
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vmul.vx v24, v8, t2
+; RV64V-NEXT:    vxor.vv v24, v16, v24
+; RV64V-NEXT:    vmul.vx v0, v8, t3
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    slli t2, a0, 50
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vmul.vx v0, v8, t2
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    csrr t2, vlenb
+; RV64V-NEXT:    slli t2, t2, 3
+; RV64V-NEXT:    mv t3, t2
+; RV64V-NEXT:    slli t2, t2, 1
+; RV64V-NEXT:    add t2, t2, t3
+; RV64V-NEXT:    add t2, sp, t2
+; RV64V-NEXT:    addi t2, t2, 16
+; RV64V-NEXT:    vl8r.v v0, (t2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vsrl.vi v0, v0, 8
+; RV64V-NEXT:    vand.vx v0, v0, t0
+; RV64V-NEXT:    vsrl.vi v16, v16, 24
+; RV64V-NEXT:    vand.vx v16, v16, a2
+; RV64V-NEXT:    vor.vv v16, v0, v16
+; RV64V-NEXT:    addi t2, sp, 16
+; RV64V-NEXT:    vs8r.v v16, (t2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    slli t2, a0, 51
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vmul.vx v16, v8, t2
+; RV64V-NEXT:    vxor.vv v16, v24, v16
+; RV64V-NEXT:    slli t2, a0, 52
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vmul.vx v24, v8, t2
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    slli t2, a0, 53
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vmul.vx v24, v8, t2
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    slli t2, a0, 54
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vmul.vx v24, v8, t2
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    slli t2, a0, 55
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    vmul.vx v24, v8, t2
+; RV64V-NEXT:    vxor.vv v16, v16, v24
+; RV64V-NEXT:    csrr t2, vlenb
+; RV64V-NEXT:    slli t2, t2, 4
+; RV64V-NEXT:    add t2, sp, t2
+; RV64V-NEXT:    addi t2, t2, 16
+; RV64V-NEXT:    vs8r.v v16, (t2) # vscale x 64-byte Folded Spill
+; RV64V-NEXT:    slli t2, a0, 56
+; RV64V-NEXT:    slli t3, a0, 57
+; RV64V-NEXT:    and t2, t1, t2
+; RV64V-NEXT:    and t3, t1, t3
+; RV64V-NEXT:    vmul.vx v24, v8, t2
+; RV64V-NEXT:    vxor.vv v24, v16, v24
+; RV64V-NEXT:    vmul.vx v0, v8, t3
+; RV64V-NEXT:    vxor.vv v24, v24, v0
+; RV64V-NEXT:    csrr t2, vlenb
+; RV64V-NEXT:    slli t2, t2, 3
+; RV64V-NEXT:    mv t3, t2
+; RV64V-NEXT:    slli t2, t2, 1
+; RV64V-NEXT:    add t2, t2, t3
+; RV64V-NEXT:    add t2, sp, t2
+; RV64V-NEXT:    addi t2, t2, 16
+; RV64V-NEXT:    vl8r.v v16, (t2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vand.vx v0, v16, a2
+; RV64V-NEXT:    vsll.vi v0, v0, 24
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 4
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vand.vx v16, v16, t0
+; RV64V-NEXT:    vsll.vi v16, v16, 8
+; RV64V-NEXT:    vor.vv v16, v0, v16
+; RV64V-NEXT:    slli a2, a0, 58
+; RV64V-NEXT:    and a2, t1, a2
+; RV64V-NEXT:    vmul.vx v0, v8, a2
+; RV64V-NEXT:    vxor.vv v0, v24, v0
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 16
+; RV64V-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vor.vv v24, v24, v16
+; RV64V-NEXT:    slli a2, a0, 59
+; RV64V-NEXT:    and a2, t1, a2
+; RV64V-NEXT:    vmul.vx v16, v8, a2
+; RV64V-NEXT:    vxor.vv v16, v0, v16
+; RV64V-NEXT:    slli a2, a0, 60
+; RV64V-NEXT:    and a2, t1, a2
+; RV64V-NEXT:    vmul.vx v0, v8, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v0
+; RV64V-NEXT:    slli a2, a0, 61
+; RV64V-NEXT:    and a2, t1, a2
+; RV64V-NEXT:    vmul.vx v0, v8, a2
+; RV64V-NEXT:    vxor.vv v16, v16, v0
+; RV64V-NEXT:    slli a0, a0, 62
+; RV64V-NEXT:    and a0, t1, a0
+; RV64V-NEXT:    vmul.vx v0, v8, a0
+; RV64V-NEXT:    vxor.vv v16, v16, v0
+; RV64V-NEXT:    srli a0, t1, 63
+; RV64V-NEXT:    slli a0, a0, 63
+; RV64V-NEXT:    vmul.vx v8, v8, a0
+; RV64V-NEXT:    vxor.vv v8, v16, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 16
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vsrl.vx v16, v16, a7
+; RV64V-NEXT:    vand.vx v16, v16, a6
+; RV64V-NEXT:    vsrl.vx v8, v8, a1
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    addi a0, sp, 16
+; RV64V-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vor.vv v8, v24, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, a5
+; RV64V-NEXT:    vand.vx v16, v16, a5
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, a4
+; RV64V-NEXT:    vand.vx v16, v16, a4
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v16, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, a3
+; RV64V-NEXT:    vand.vx v16, v16, a3
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v8, v16, v8
+; RV64V-NEXT:    vsrl.vi v8, v8, 1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add sp, sp, a0
+; RV64V-NEXT:    addi sp, sp, 16
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv8i64_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    addi sp, sp, -16
+; RV32ZVBC-NEXT:    sw a0, 8(sp)
+; RV32ZVBC-NEXT:    sw a1, 12(sp)
+; RV32ZVBC-NEXT:    addi a0, sp, 8
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32ZVBC-NEXT:    vlse64.v v16, (a0), zero
+; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v16
+; RV32ZVBC-NEXT:    addi sp, sp, 16
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv8i64_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64ZVBC-NEXT:    vclmulh.vx v8, v8, a0
+; RV64ZVBC-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i128 0
+  %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+  %va.ext = zext <vscale x 8 x i64> %va to <vscale x 8 x i128>
+  %vb.ext = zext <vscale x 8 x i64> %vb to <vscale x 8 x i128>
+  %clmul = call <vscale x 8 x i128> @llvm.clmul.nxv8i128(<vscale x 8 x i128> %va.ext, <vscale x 8 x i128> %vb.ext)
+  %res.ext = lshr <vscale x 8 x i128> %clmul, splat(i128 64)
+  %res = trunc <vscale x 8 x i128> %res.ext to <vscale x 8 x i64>
+  ret <vscale x 8 x i64> %res
+}
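
The vsrl/vand/vsll/vor ladders bracketing the multiplies above implement a full 64-bit bit reversal: a byte swap followed by nibble, bit-pair, and single-bit swaps, with the 0x0F0F..., 0x3333..., and 0x5555... masks coming from the lui/addi pairs in the checks. A scalar sketch of that stage, assuming a GCC/Clang-style __builtin_bswap64:

    #include <cstdint>

    // Hedged sketch of the bit reversal the expansion performs.
    uint64_t bitreverse64(uint64_t v) {
      v = __builtin_bswap64(v);  // the vsrl.vx/vsll.vx byte-swap stage
      v = ((v >> 4) & 0x0F0F0F0F0F0F0F0Full) | ((v & 0x0F0F0F0F0F0F0F0Full) << 4);
      v = ((v >> 2) & 0x3333333333333333ull) | ((v & 0x3333333333333333ull) << 2);
      v = ((v >> 1) & 0x5555555555555555ull) | ((v & 0x5555555555555555ull) << 1);
      return v;
    }
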
+
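With Zvbc the whole operation collapses to a single vclmulh.vv/vclmulh.vx, as the RV32ZVBC/RV64ZVBC checks show; without it, the legalizer emits the per-bit ladder checked above, where vmul.vx by the isolated bit (b & (1 << i)) stands in for a shift because the operand is a power of two. A hedged scalar sketch of that expansion and of the clmulh-as-high-half identity spelled out by the IR, not code taken from the patch:

    #include <cstdint>

    // Low 64 bits of the carry-less product: XOR of x << i for each set
    // bit of y. Multiplying by the isolated bit performs the shift, which
    // is exactly what the vmul.vx/vxor.vv ladder does one bit at a time.
    uint64_t clmul64(uint64_t x, uint64_t y) {
      uint64_t acc = 0;
      for (int i = 0; i < 64; ++i)
        acc ^= x * (y & (uint64_t{1} << i));
      return acc;
    }

    // clmulh: zero-extend, carry-less multiply, take the high half --
    // mirroring the zext/clmul/lshr 64/trunc pattern in the IR above.
    // Assumes a compiler providing the unsigned __int128 extension.
    uint64_t clmulh64(uint64_t x, uint64_t y) {
      unsigned __int128 acc = 0;
      for (int i = 0; i < 64; ++i)
        if ((y >> i) & 1)
          acc ^= static_cast<unsigned __int128>(x) << i;
      return static_cast<uint64_t>(acc >> 64);
    }
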
+define <vscale x 1 x i64> @clmulh_nxv1i64_vv_mask(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %mask) {
+; RV32V-LABEL: clmulh_nxv1i64_vv_mask:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -336
+; RV32V-NEXT:    .cfi_def_cfa_offset 336
+; RV32V-NEXT:    sw ra, 332(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s0, 328(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s1, 324(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s2, 320(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s3, 316(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s4, 312(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s5, 308(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s6, 304(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s7, 300(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s8, 296(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s9, 292(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s10, 288(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s11, 284(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    .cfi_offset ra, -4
+; RV32V-NEXT:    .cfi_offset s0, -8
+; RV32V-NEXT:    .cfi_offset s1, -12
+; RV32V-NEXT:    .cfi_offset s2, -16
+; RV32V-NEXT:    .cfi_offset s3, -20
+; RV32V-NEXT:    .cfi_offset s4, -24
+; RV32V-NEXT:    .cfi_offset s5, -28
+; RV32V-NEXT:    .cfi_offset s6, -32
+; RV32V-NEXT:    .cfi_offset s7, -36
+; RV32V-NEXT:    .cfi_offset s8, -40
+; RV32V-NEXT:    .cfi_offset s9, -44
+; RV32V-NEXT:    .cfi_offset s10, -48
+; RV32V-NEXT:    .cfi_offset s11, -52
+; RV32V-NEXT:    lui a0, 1044480
+; RV32V-NEXT:    lui t6, 524288
+; RV32V-NEXT:    li s11, 1
+; RV32V-NEXT:    li s6, 2
+; RV32V-NEXT:    li t5, 4
+; RV32V-NEXT:    li s8, 8
+; RV32V-NEXT:    li s10, 16
+; RV32V-NEXT:    li s9, 32
+; RV32V-NEXT:    li s1, 64
+; RV32V-NEXT:    li s2, 128
+; RV32V-NEXT:    li s3, 256
+; RV32V-NEXT:    li s4, 512
+; RV32V-NEXT:    li s5, 1024
+; RV32V-NEXT:    lui t4, 1
+; RV32V-NEXT:    lui t3, 2
+; RV32V-NEXT:    lui t2, 4
+; RV32V-NEXT:    lui t1, 8
+; RV32V-NEXT:    lui s0, 16
+; RV32V-NEXT:    lui t0, 32
+; RV32V-NEXT:    lui a7, 64
+; RV32V-NEXT:    lui a6, 128
+; RV32V-NEXT:    lui a5, 256
+; RV32V-NEXT:    lui a4, 512
+; RV32V-NEXT:    lui a3, 1024
+; RV32V-NEXT:    lui a2, 2048
+; RV32V-NEXT:    lui a1, 4096
+; RV32V-NEXT:    lui s7, 8192
+; RV32V-NEXT:    lui ra, 16384
+; RV32V-NEXT:    sw a0, 248(sp)
+; RV32V-NEXT:    lui a0, 32768
+; RV32V-NEXT:    sw zero, 252(sp)
+; RV32V-NEXT:    sw t6, 8(sp)
+; RV32V-NEXT:    sw zero, 12(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw s11, 276(sp)
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw s6, 260(sp)
+; RV32V-NEXT:    lui s6, 65536
+; RV32V-NEXT:    sw zero, 264(sp)
+; RV32V-NEXT:    sw t5, 268(sp)
+; RV32V-NEXT:    lui t5, 131072
+; RV32V-NEXT:    sw zero, 240(sp)
+; RV32V-NEXT:    sw s8, 244(sp)
+; RV32V-NEXT:    lui s8, 262144
+; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    sw s10, 236(sp)
+; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw s9, 228(sp)
+; RV32V-NEXT:    sw zero, 216(sp)
+; RV32V-NEXT:    sw s1, 220(sp)
+; RV32V-NEXT:    sw zero, 208(sp)
+; RV32V-NEXT:    sw s2, 212(sp)
+; RV32V-NEXT:    sw zero, 200(sp)
+; RV32V-NEXT:    sw s3, 204(sp)
+; RV32V-NEXT:    sw zero, 192(sp)
+; RV32V-NEXT:    sw s4, 196(sp)
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw s5, 188(sp)
+; RV32V-NEXT:    slli s11, s11, 11
+; RV32V-NEXT:    sw zero, 176(sp)
+; RV32V-NEXT:    sw s11, 180(sp)
+; RV32V-NEXT:    sw zero, 168(sp)
+; RV32V-NEXT:    sw t4, 172(sp)
+; RV32V-NEXT:    sw zero, 160(sp)
+; RV32V-NEXT:    sw t3, 164(sp)
+; RV32V-NEXT:    sw zero, 152(sp)
+; RV32V-NEXT:    sw t2, 156(sp)
+; RV32V-NEXT:    sw zero, 144(sp)
+; RV32V-NEXT:    sw t1, 148(sp)
+; RV32V-NEXT:    lui t2, 8
+; RV32V-NEXT:    sw zero, 136(sp)
+; RV32V-NEXT:    sw s0, 140(sp)
+; RV32V-NEXT:    sw zero, 128(sp)
+; RV32V-NEXT:    sw t0, 132(sp)
+; RV32V-NEXT:    sw zero, 120(sp)
+; RV32V-NEXT:    sw a7, 124(sp)
+; RV32V-NEXT:    sw zero, 112(sp)
+; RV32V-NEXT:    sw a6, 116(sp)
+; RV32V-NEXT:    sw zero, 104(sp)
+; RV32V-NEXT:    sw a5, 108(sp)
+; RV32V-NEXT:    lui t3, 256
+; RV32V-NEXT:    sw zero, 96(sp)
+; RV32V-NEXT:    sw a4, 100(sp)
+; RV32V-NEXT:    lui t4, 512
+; RV32V-NEXT:    sw zero, 88(sp)
+; RV32V-NEXT:    sw a3, 92(sp)
+; RV32V-NEXT:    lui a5, 1024
+; RV32V-NEXT:    sw zero, 80(sp)
+; RV32V-NEXT:    sw a2, 84(sp)
+; RV32V-NEXT:    lui a4, 2048
+; RV32V-NEXT:    sw zero, 72(sp)
+; RV32V-NEXT:    sw a1, 76(sp)
+; RV32V-NEXT:    lui a3, 4096
+; RV32V-NEXT:    sw zero, 64(sp)
+; RV32V-NEXT:    sw s7, 68(sp)
+; RV32V-NEXT:    lui t1, 8192
+; RV32V-NEXT:    sw zero, 56(sp)
+; RV32V-NEXT:    sw ra, 60(sp)
+; RV32V-NEXT:    sw zero, 48(sp)
+; RV32V-NEXT:    sw a0, 52(sp)
+; RV32V-NEXT:    lui s7, 32768
+; RV32V-NEXT:    sw zero, 40(sp)
+; RV32V-NEXT:    sw s6, 44(sp)
+; RV32V-NEXT:    sw zero, 32(sp)
+; RV32V-NEXT:    sw t5, 36(sp)
+; RV32V-NEXT:    sw zero, 24(sp)
+; RV32V-NEXT:    sw s8, 28(sp)
+; RV32V-NEXT:    sw zero, 16(sp)
+; RV32V-NEXT:    sw t6, 20(sp)
+; RV32V-NEXT:    lui a0, 61681
+; RV32V-NEXT:    addi a0, a0, -241
+; RV32V-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32V-NEXT:    vmv.v.x v10, a0
+; RV32V-NEXT:    lui a0, 209715
+; RV32V-NEXT:    addi a0, a0, 819
+; RV32V-NEXT:    vmv.v.x v11, a0
+; RV32V-NEXT:    lui a0, 349525
+; RV32V-NEXT:    addi a0, a0, 1365
+; RV32V-NEXT:    vmv.v.x v12, a0
+; RV32V-NEXT:    addi a0, sp, 248
+; RV32V-NEXT:    vsetvli a1, zero, e64, m1, ta, mu
+; RV32V-NEXT:    vlse64.v v13, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 8
+; RV32V-NEXT:    vlse64.v v6, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 272
+; RV32V-NEXT:    vlse64.v v7, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 256
+; RV32V-NEXT:    vlse64.v v31, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 264
+; RV32V-NEXT:    vlse64.v v30, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 240
+; RV32V-NEXT:    vlse64.v v29, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 232
+; RV32V-NEXT:    vlse64.v v28, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 224
+; RV32V-NEXT:    vlse64.v v27, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 216
+; RV32V-NEXT:    vlse64.v v22, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 208
+; RV32V-NEXT:    vlse64.v v19, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 200
+; RV32V-NEXT:    vlse64.v v18, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 192
+; RV32V-NEXT:    vlse64.v v23, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 184
+; RV32V-NEXT:    vlse64.v v16, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 176
+; RV32V-NEXT:    vlse64.v v14, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 168
+; RV32V-NEXT:    vlse64.v v15, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 160
+; RV32V-NEXT:    vlse64.v v17, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 152
+; RV32V-NEXT:    vlse64.v v20, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 144
+; RV32V-NEXT:    vlse64.v v25, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 136
+; RV32V-NEXT:    vlse64.v v21, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 128
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 120
+; RV32V-NEXT:    vlse64.v v26, (a0), zero
+; RV32V-NEXT:    li ra, 56
+; RV32V-NEXT:    vsrl.vi v5, v9, 24
+; RV32V-NEXT:    vsrl.vi v4, v9, 8
+; RV32V-NEXT:    vsrl.vx v3, v9, ra
+; RV32V-NEXT:    li a2, 40
+; RV32V-NEXT:    vsrl.vx v2, v9, a2
+; RV32V-NEXT:    lui a1, 4080
+; RV32V-NEXT:    vand.vx v5, v5, a1
+; RV32V-NEXT:    vsll.vx v1, v9, ra
+; RV32V-NEXT:    addi a0, s0, -256
+; RV32V-NEXT:    vand.vx v2, v2, a0
+; RV32V-NEXT:    vor.vv v3, v2, v3
+; RV32V-NEXT:    vand.vx v2, v9, a0
+; RV32V-NEXT:    vsll.vx v2, v2, a2
+; RV32V-NEXT:    vor.vv v2, v1, v2
+; RV32V-NEXT:    vand.vx v1, v9, a1
+; RV32V-NEXT:    vsll.vi v1, v1, 24
+; RV32V-NEXT:    vand.vv v4, v4, v13
+; RV32V-NEXT:    vand.vv v9, v9, v13
+; RV32V-NEXT:    vor.vv v5, v4, v5
+; RV32V-NEXT:    vsll.vi v9, v9, 8
+; RV32V-NEXT:    vor.vv v5, v5, v3
+; RV32V-NEXT:    vor.vv v9, v1, v9
+; RV32V-NEXT:    vor.vv v9, v2, v9
+; RV32V-NEXT:    vor.vv v9, v9, v5
+; RV32V-NEXT:    vsrl.vi v5, v9, 4
 ; RV32V-NEXT:    vand.vv v9, v9, v10
-; RV32V-NEXT:    vsll.vi v8, v8, 4
-; RV32V-NEXT:    vor.vv v8, v9, v8
-; RV32V-NEXT:    vsrl.vi v9, v8, 2
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v10
+; RV32V-NEXT:    vand.vv v5, v5, v10
+; RV32V-NEXT:    vsll.vi v9, v9, 4
+; RV32V-NEXT:    vor.vv v9, v5, v9
+; RV32V-NEXT:    vsrl.vi v5, v9, 2
+; RV32V-NEXT:    vand.vv v9, v9, v11
+; RV32V-NEXT:    vand.vv v5, v5, v11
+; RV32V-NEXT:    vsll.vi v9, v9, 2
+; RV32V-NEXT:    vor.vv v9, v5, v9
+; RV32V-NEXT:    vsrl.vi v5, v9, 1
+; RV32V-NEXT:    vand.vv v9, v9, v12
+; RV32V-NEXT:    vand.vv v5, v5, v12
+; RV32V-NEXT:    vadd.vv v9, v9, v9
+; RV32V-NEXT:    vor.vv v9, v5, v9
+; RV32V-NEXT:    vand.vx v4, v9, s10
+; RV32V-NEXT:    vsrl.vi v5, v8, 24
+; RV32V-NEXT:    vsrl.vx v3, v8, ra
+; RV32V-NEXT:    vsrl.vx v2, v8, a2
+; RV32V-NEXT:    vsll.vx v1, v8, ra
+; RV32V-NEXT:    vand.vx v2, v2, a0
+; RV32V-NEXT:    vor.vv v3, v2, v3
+; RV32V-NEXT:    vand.vx v2, v8, a0
+; RV32V-NEXT:    vsll.vx v2, v2, a2
+; RV32V-NEXT:    vor.vv v2, v1, v2
+; RV32V-NEXT:    vsrl.vi v1, v8, 8
+; RV32V-NEXT:    vand.vx v5, v5, a1
+; RV32V-NEXT:    vand.vv v1, v1, v13
+; RV32V-NEXT:    vor.vv v5, v1, v5
+; RV32V-NEXT:    vand.vx v1, v8, a1
+; RV32V-NEXT:    vsll.vi v1, v1, 24
+; RV32V-NEXT:    vor.vv v5, v5, v3
+; RV32V-NEXT:    vand.vv v3, v8, v13
+; RV32V-NEXT:    vsll.vi v3, v3, 8
+; RV32V-NEXT:    vor.vv v3, v1, v3
+; RV32V-NEXT:    vand.vx v1, v9, s9
+; RV32V-NEXT:    vor.vv v3, v2, v3
+; RV32V-NEXT:    vor.vv v5, v3, v5
+; RV32V-NEXT:    vsrl.vi v3, v5, 4
+; RV32V-NEXT:    vand.vv v5, v5, v10
+; RV32V-NEXT:    vand.vv v3, v3, v10
+; RV32V-NEXT:    vsll.vi v5, v5, 4
+; RV32V-NEXT:    vor.vv v5, v3, v5
+; RV32V-NEXT:    vsrl.vi v3, v5, 2
+; RV32V-NEXT:    vand.vv v5, v5, v11
+; RV32V-NEXT:    vand.vv v3, v3, v11
+; RV32V-NEXT:    vsll.vi v5, v5, 2
+; RV32V-NEXT:    vor.vv v5, v3, v5
+; RV32V-NEXT:    vsrl.vi v3, v5, 1
+; RV32V-NEXT:    vand.vv v5, v5, v12
+; RV32V-NEXT:    vand.vv v3, v3, v12
+; RV32V-NEXT:    vadd.vv v5, v5, v5
+; RV32V-NEXT:    vor.vv v5, v3, v5
+; RV32V-NEXT:    vand.vi v3, v9, 2
+; RV32V-NEXT:    vand.vi v2, v9, 1
+; RV32V-NEXT:    vmul.vv v3, v5, v3
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v3, v2, v3
+; RV32V-NEXT:    vand.vi v2, v9, 4
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v3, v3, v2
+; RV32V-NEXT:    vand.vi v2, v9, 8
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v3, v3, v2
+; RV32V-NEXT:    vand.vx v2, v9, s1
+; RV32V-NEXT:    vmul.vv v4, v5, v4
+; RV32V-NEXT:    vxor.vv v4, v3, v4
+; RV32V-NEXT:    vand.vx v3, v9, s2
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v4, v4, v1
+; RV32V-NEXT:    vand.vx v1, v9, s3
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v4, v4, v2
+; RV32V-NEXT:    vand.vx v2, v9, s4
+; RV32V-NEXT:    vmul.vv v3, v5, v3
+; RV32V-NEXT:    vxor.vv v4, v4, v3
+; RV32V-NEXT:    vand.vx v3, v9, s5
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v1, v4, v1
+; RV32V-NEXT:    vxor.vv v2, v1, v2
+; RV32V-NEXT:    vand.vx v1, v9, s11
+; RV32V-NEXT:    vmul.vv v3, v5, v3
+; RV32V-NEXT:    vxor.vv v3, v2, v3
+; RV32V-NEXT:    lui t6, 1
+; RV32V-NEXT:    vand.vx v2, v9, t6
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v3, v3, v1
+; RV32V-NEXT:    lui t6, 2
+; RV32V-NEXT:    vand.vx v1, v9, t6
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v3, v3, v2
+; RV32V-NEXT:    lui t6, 4
+; RV32V-NEXT:    vand.vx v2, v9, t6
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v3, v3, v1
+; RV32V-NEXT:    vand.vx v1, v9, t2
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v3, v3, v2
+; RV32V-NEXT:    vand.vx v2, v9, s0
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v3, v3, v1
+; RV32V-NEXT:    vand.vx v1, v9, t0
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v3, v3, v2
+; RV32V-NEXT:    vand.vx v2, v9, a7
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v3, v3, v1
+; RV32V-NEXT:    vand.vx v1, v9, a6
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v3, v3, v2
+; RV32V-NEXT:    vand.vx v2, v9, t3
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v3, v3, v1
+; RV32V-NEXT:    vand.vx v1, v9, t4
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v3, v3, v2
+; RV32V-NEXT:    vand.vx v2, v9, a5
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v3, v3, v1
+; RV32V-NEXT:    vand.vx v1, v9, a4
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v3, v3, v2
+; RV32V-NEXT:    vand.vx v2, v9, a3
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v3, v3, v1
+; RV32V-NEXT:    vand.vx v1, v9, t1
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v2, v3, v2
+; RV32V-NEXT:    vxor.vv v2, v2, v1
+; RV32V-NEXT:    lui a3, 16384
+; RV32V-NEXT:    vand.vx v1, v9, a3
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v2, v2, v1
+; RV32V-NEXT:    vand.vx v1, v9, s7
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v2, v2, v1
+; RV32V-NEXT:    vand.vx v1, v9, s6
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v2, v2, v1
+; RV32V-NEXT:    vand.vx v1, v9, t5
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v2, v2, v1
+; RV32V-NEXT:    vand.vx v1, v9, s8
+; RV32V-NEXT:    addi a3, sp, 112
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v1, v2, v1
+; RV32V-NEXT:    vlse64.v v2, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 104
+; RV32V-NEXT:    vand.vv v6, v9, v6
+; RV32V-NEXT:    vmul.vv v6, v5, v6
+; RV32V-NEXT:    vxor.vv v1, v1, v6
+; RV32V-NEXT:    vlse64.v v6, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 96
+; RV32V-NEXT:    vand.vv v7, v9, v7
+; RV32V-NEXT:    vmul.vv v7, v5, v7
+; RV32V-NEXT:    vxor.vv v1, v1, v7
+; RV32V-NEXT:    vlse64.v v7, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 88
+; RV32V-NEXT:    vand.vv v31, v9, v31
+; RV32V-NEXT:    vmul.vv v31, v5, v31
+; RV32V-NEXT:    vxor.vv v1, v1, v31
+; RV32V-NEXT:    vlse64.v v31, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 80
+; RV32V-NEXT:    vand.vv v30, v9, v30
+; RV32V-NEXT:    vmul.vv v30, v5, v30
+; RV32V-NEXT:    vxor.vv v1, v1, v30
+; RV32V-NEXT:    vlse64.v v30, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 72
+; RV32V-NEXT:    vand.vv v29, v9, v29
+; RV32V-NEXT:    vmul.vv v29, v5, v29
+; RV32V-NEXT:    vxor.vv v1, v1, v29
+; RV32V-NEXT:    vlse64.v v29, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 64
+; RV32V-NEXT:    vand.vv v28, v9, v28
+; RV32V-NEXT:    vmul.vv v28, v5, v28
+; RV32V-NEXT:    vxor.vv v1, v1, v28
+; RV32V-NEXT:    vlse64.v v28, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 56
+; RV32V-NEXT:    vand.vv v27, v9, v27
+; RV32V-NEXT:    vmul.vv v27, v5, v27
+; RV32V-NEXT:    vxor.vv v1, v1, v27
+; RV32V-NEXT:    vlse64.v v27, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 48
+; RV32V-NEXT:    vand.vv v22, v9, v22
+; RV32V-NEXT:    vmul.vv v22, v5, v22
+; RV32V-NEXT:    vxor.vv v1, v1, v22
+; RV32V-NEXT:    vlse64.v v22, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 40
+; RV32V-NEXT:    vand.vv v19, v9, v19
+; RV32V-NEXT:    vmul.vv v19, v5, v19
+; RV32V-NEXT:    vxor.vv v19, v1, v19
+; RV32V-NEXT:    vlse64.v v1, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 32
+; RV32V-NEXT:    vand.vv v18, v9, v18
+; RV32V-NEXT:    vand.vv v23, v9, v23
+; RV32V-NEXT:    vmul.vv v18, v5, v18
+; RV32V-NEXT:    vmul.vv v23, v5, v23
+; RV32V-NEXT:    vxor.vv v18, v19, v18
+; RV32V-NEXT:    vxor.vv v23, v18, v23
+; RV32V-NEXT:    vlse64.v v18, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 24
+; RV32V-NEXT:    vand.vv v16, v9, v16
+; RV32V-NEXT:    vmul.vv v16, v5, v16
+; RV32V-NEXT:    vxor.vv v16, v23, v16
+; RV32V-NEXT:    vlse64.v v23, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 16
+; RV32V-NEXT:    vand.vv v14, v9, v14
+; RV32V-NEXT:    vmul.vv v14, v5, v14
+; RV32V-NEXT:    vxor.vv v14, v16, v14
+; RV32V-NEXT:    vlse64.v v16, (a3), zero
+; RV32V-NEXT:    vand.vv v15, v9, v15
+; RV32V-NEXT:    vmul.vv v15, v5, v15
+; RV32V-NEXT:    vxor.vv v14, v14, v15
+; RV32V-NEXT:    vand.vv v15, v9, v17
+; RV32V-NEXT:    vmul.vv v15, v5, v15
+; RV32V-NEXT:    vxor.vv v14, v14, v15
+; RV32V-NEXT:    vand.vv v15, v9, v20
+; RV32V-NEXT:    vand.vv v17, v9, v25
+; RV32V-NEXT:    vmul.vv v15, v5, v15
+; RV32V-NEXT:    vmul.vv v17, v5, v17
+; RV32V-NEXT:    vxor.vv v14, v14, v15
+; RV32V-NEXT:    vand.vx v15, v19, a1
+; RV32V-NEXT:    vxor.vv v14, v14, v17
+; RV32V-NEXT:    vsrl.vi v17, v14, 24
+; RV32V-NEXT:    vand.vx v17, v17, a1
+; RV32V-NEXT:    vand.vv v20, v9, v21
+; RV32V-NEXT:    vmul.vv v20, v5, v20
+; RV32V-NEXT:    vxor.vv v14, v14, v20
+; RV32V-NEXT:    vand.vv v20, v9, v24
+; RV32V-NEXT:    vmul.vv v20, v5, v20
+; RV32V-NEXT:    vxor.vv v14, v14, v20
+; RV32V-NEXT:    vand.vv v20, v9, v26
+; RV32V-NEXT:    vmul.vv v20, v5, v20
+; RV32V-NEXT:    vxor.vv v14, v14, v20
+; RV32V-NEXT:    vand.vv v20, v9, v2
+; RV32V-NEXT:    vmul.vv v20, v5, v20
+; RV32V-NEXT:    vxor.vv v14, v14, v20
+; RV32V-NEXT:    vand.vv v20, v9, v6
+; RV32V-NEXT:    vmul.vv v20, v5, v20
+; RV32V-NEXT:    vxor.vv v14, v14, v20
+; RV32V-NEXT:    vand.vv v20, v9, v7
+; RV32V-NEXT:    vmul.vv v20, v5, v20
+; RV32V-NEXT:    vxor.vv v14, v14, v20
+; RV32V-NEXT:    vand.vv v20, v9, v31
+; RV32V-NEXT:    vmul.vv v20, v5, v20
+; RV32V-NEXT:    vxor.vv v14, v14, v20
+; RV32V-NEXT:    vand.vv v20, v9, v30
+; RV32V-NEXT:    vmul.vv v20, v5, v20
+; RV32V-NEXT:    vand.vx v21, v3, a0
+; RV32V-NEXT:    vsll.vx v21, v21, a2
+; RV32V-NEXT:    vxor.vv v14, v14, v20
+; RV32V-NEXT:    vsrl.vx v20, v14, a2
+; RV32V-NEXT:    vand.vx v20, v20, a0
+; RV32V-NEXT:    vand.vv v24, v9, v29
+; RV32V-NEXT:    vand.vv v25, v9, v28
+; RV32V-NEXT:    vand.vv v26, v9, v27
+; RV32V-NEXT:    vand.vv v22, v9, v22
+; RV32V-NEXT:    vand.vv v27, v9, v1
+; RV32V-NEXT:    vand.vv v18, v9, v18
+; RV32V-NEXT:    vand.vv v23, v9, v23
+; RV32V-NEXT:    vand.vv v9, v9, v16
+; RV32V-NEXT:    vmul.vv v16, v5, v24
+; RV32V-NEXT:    vmul.vv v24, v5, v25
+; RV32V-NEXT:    vmul.vv v25, v5, v26
+; RV32V-NEXT:    vmul.vv v22, v5, v22
+; RV32V-NEXT:    vmul.vv v26, v5, v27
+; RV32V-NEXT:    vmul.vv v18, v5, v18
+; RV32V-NEXT:    vmul.vv v23, v5, v23
+; RV32V-NEXT:    vmul.vv v9, v5, v9
+; RV32V-NEXT:    vxor.vv v16, v14, v16
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v25
+; RV32V-NEXT:    vxor.vv v16, v16, v22
+; RV32V-NEXT:    vxor.vv v16, v16, v26
+; RV32V-NEXT:    vxor.vv v16, v16, v18
+; RV32V-NEXT:    vxor.vv v16, v16, v23
+; RV32V-NEXT:    vxor.vv v9, v16, v9
+; RV32V-NEXT:    vsll.vx v16, v4, ra
+; RV32V-NEXT:    vsrl.vx v9, v9, ra
+; RV32V-NEXT:    vor.vv v16, v16, v21
+; RV32V-NEXT:    vsrl.vi v18, v19, 8
+; RV32V-NEXT:    vsll.vi v15, v15, 24
+; RV32V-NEXT:    vand.vv v18, v18, v13
+; RV32V-NEXT:    vor.vv v17, v18, v17
+; RV32V-NEXT:    vand.vv v13, v14, v13
+; RV32V-NEXT:    vsll.vi v13, v13, 8
+; RV32V-NEXT:    vor.vv v13, v15, v13
+; RV32V-NEXT:    vor.vv v13, v16, v13
+; RV32V-NEXT:    vor.vv v9, v20, v9
+; RV32V-NEXT:    vor.vv v9, v17, v9
+; RV32V-NEXT:    vor.vv v9, v13, v9
+; RV32V-NEXT:    vsrl.vi v13, v9, 4
 ; RV32V-NEXT:    vand.vv v9, v9, v10
-; RV32V-NEXT:    vsll.vi v8, v8, 2
-; RV32V-NEXT:    vor.vv v8, v9, v8
-; RV32V-NEXT:    vsrl.vi v9, v8, 1
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 4
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    vand.vv v8, v8, v10
+; RV32V-NEXT:    vand.vv v10, v13, v10
+; RV32V-NEXT:    vsll.vi v9, v9, 4
+; RV32V-NEXT:    vor.vv v9, v10, v9
+; RV32V-NEXT:    vsrl.vi v10, v9, 2
+; RV32V-NEXT:    vand.vv v9, v9, v11
+; RV32V-NEXT:    vand.vv v10, v10, v11
+; RV32V-NEXT:    vsll.vi v9, v9, 2
+; RV32V-NEXT:    vor.vv v9, v10, v9
+; RV32V-NEXT:    vsrl.vi v10, v9, 1
+; RV32V-NEXT:    vand.vv v9, v9, v12
+; RV32V-NEXT:    vand.vv v10, v10, v12
+; RV32V-NEXT:    vadd.vv v9, v9, v9
+; RV32V-NEXT:    vor.vv v9, v10, v9
+; RV32V-NEXT:    vsrl.vi v8, v9, 1, v0.t
+; RV32V-NEXT:    lw ra, 332(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s0, 328(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s1, 324(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s2, 320(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s3, 316(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s4, 312(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s5, 308(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s6, 304(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s7, 300(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s8, 296(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s9, 292(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s10, 288(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s11, 284(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    .cfi_restore ra
+; RV32V-NEXT:    .cfi_restore s0
+; RV32V-NEXT:    .cfi_restore s1
+; RV32V-NEXT:    .cfi_restore s2
+; RV32V-NEXT:    .cfi_restore s3
+; RV32V-NEXT:    .cfi_restore s4
+; RV32V-NEXT:    .cfi_restore s5
+; RV32V-NEXT:    .cfi_restore s6
+; RV32V-NEXT:    .cfi_restore s7
+; RV32V-NEXT:    .cfi_restore s8
+; RV32V-NEXT:    .cfi_restore s9
+; RV32V-NEXT:    .cfi_restore s10
+; RV32V-NEXT:    .cfi_restore s11
+; RV32V-NEXT:    addi sp, sp, 336
+; RV32V-NEXT:    .cfi_def_cfa_offset 0
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv1i64_vv_mask:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    addi sp, sp, -224
+; RV64V-NEXT:    .cfi_def_cfa_offset 224
+; RV64V-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    .cfi_offset ra, -8
+; RV64V-NEXT:    .cfi_offset s0, -16
+; RV64V-NEXT:    .cfi_offset s1, -24
+; RV64V-NEXT:    .cfi_offset s2, -32
+; RV64V-NEXT:    .cfi_offset s3, -40
+; RV64V-NEXT:    .cfi_offset s4, -48
+; RV64V-NEXT:    .cfi_offset s5, -56
+; RV64V-NEXT:    .cfi_offset s6, -64
+; RV64V-NEXT:    .cfi_offset s7, -72
+; RV64V-NEXT:    .cfi_offset s8, -80
+; RV64V-NEXT:    .cfi_offset s9, -88
+; RV64V-NEXT:    .cfi_offset s10, -96
+; RV64V-NEXT:    .cfi_offset s11, -104
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    sub sp, sp, a0
+; RV64V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xe0, 0x01, 0x22, 0x11, 0x2f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 224 + 47 * vlenb
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
+; RV64V-NEXT:    vmv1r.v v18, v8
+; RV64V-NEXT:    li a3, 56
+; RV64V-NEXT:    lui a2, 16
+; RV64V-NEXT:    vsrl.vi v16, v8, 24
+; RV64V-NEXT:    vsrl.vi v10, v8, 8
+; RV64V-NEXT:    li t5, 255
+; RV64V-NEXT:    lui a0, 61681
+; RV64V-NEXT:    lui a1, 209715
+; RV64V-NEXT:    lui a5, 349525
+; RV64V-NEXT:    vsrl.vi v12, v9, 24
+; RV64V-NEXT:    vsrl.vi v11, v9, 8
+; RV64V-NEXT:    li ra, 16
+; RV64V-NEXT:    li s11, 32
+; RV64V-NEXT:    li s10, 64
+; RV64V-NEXT:    li s8, 128
+; RV64V-NEXT:    li s9, 256
+; RV64V-NEXT:    li s7, 512
+; RV64V-NEXT:    li s6, 1024
+; RV64V-NEXT:    li t0, 1
+; RV64V-NEXT:    lui s5, 1
+; RV64V-NEXT:    lui a6, 2
+; RV64V-NEXT:    lui a7, 4
+; RV64V-NEXT:    lui t1, 8
+; RV64V-NEXT:    lui t2, 32
+; RV64V-NEXT:    lui t3, 64
+; RV64V-NEXT:    lui t4, 128
+; RV64V-NEXT:    lui s3, 256
+; RV64V-NEXT:    lui s4, 512
+; RV64V-NEXT:    addi s0, a0, -241
+; RV64V-NEXT:    addi s1, a1, 819
+; RV64V-NEXT:    addi s2, a5, 1365
+; RV64V-NEXT:    slli a0, s0, 32
+; RV64V-NEXT:    add s0, s0, a0
+; RV64V-NEXT:    slli a0, s1, 32
+; RV64V-NEXT:    add s1, s1, a0
+; RV64V-NEXT:    slli a0, s2, 32
+; RV64V-NEXT:    add s2, s2, a0
+; RV64V-NEXT:    addi t6, a2, -256
+; RV64V-NEXT:    slli t5, t5, 24
+; RV64V-NEXT:    vsrl.vx v8, v9, a3
+; RV64V-NEXT:    li a0, 40
+; RV64V-NEXT:    vsrl.vx v13, v9, a0
+; RV64V-NEXT:    lui a1, 4080
+; RV64V-NEXT:    vand.vx v12, v12, a1
+; RV64V-NEXT:    vand.vx v14, v9, a1
+; RV64V-NEXT:    vsll.vx v15, v9, a3
+; RV64V-NEXT:    vand.vx v13, v13, t6
+; RV64V-NEXT:    vand.vx v11, v11, t5
+; RV64V-NEXT:    vsll.vi v14, v14, 24
+; RV64V-NEXT:    vand.vx v17, v9, t5
+; RV64V-NEXT:    vand.vx v9, v9, t6
+; RV64V-NEXT:    vor.vv v8, v13, v8
+; RV64V-NEXT:    vor.vv v11, v11, v12
+; RV64V-NEXT:    vsll.vi v12, v17, 8
+; RV64V-NEXT:    vsll.vx v9, v9, a0
+; RV64V-NEXT:    li a4, 40
+; RV64V-NEXT:    vor.vv v8, v11, v8
+; RV64V-NEXT:    vor.vv v11, v14, v12
+; RV64V-NEXT:    vor.vv v9, v15, v9
+; RV64V-NEXT:    vor.vv v9, v9, v11
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    vsrl.vi v9, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, s0
+; RV64V-NEXT:    vand.vx v9, v9, s0
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    vsrl.vi v9, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, s1
+; RV64V-NEXT:    vand.vx v9, v9, s1
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    vsrl.vi v9, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, s2
+; RV64V-NEXT:    vand.vx v9, v9, s2
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v11, v9, v8
+; RV64V-NEXT:    vand.vx v13, v11, ra
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vand.vx v14, v11, s11
+; RV64V-NEXT:    lui a1, 8192
+; RV64V-NEXT:    vand.vx v15, v11, s10
+; RV64V-NEXT:    lui a3, 16384
+; RV64V-NEXT:    vand.vx v17, v11, s8
+; RV64V-NEXT:    lui s8, 32768
+; RV64V-NEXT:    vand.vx v19, v11, s9
+; RV64V-NEXT:    lui s9, 65536
+; RV64V-NEXT:    vand.vx v20, v11, s7
+; RV64V-NEXT:    lui s11, 131072
+; RV64V-NEXT:    vand.vx v21, v11, s6
+; RV64V-NEXT:    slli a5, t0, 11
+; RV64V-NEXT:    vand.vx v22, v11, a5
+; RV64V-NEXT:    lui ra, 262144
+; RV64V-NEXT:    li a5, 56
+; RV64V-NEXT:    vsrl.vx v5, v18, a5
+; RV64V-NEXT:    vsrl.vx v1, v18, a4
+; RV64V-NEXT:    lui s6, 4080
+; RV64V-NEXT:    vand.vx v2, v16, s6
+; RV64V-NEXT:    vand.vx v8, v18, s6
+; RV64V-NEXT:    vsll.vx v4, v18, a5
+; RV64V-NEXT:    vand.vx v23, v11, s5
+; RV64V-NEXT:    slli s10, t0, 31
+; RV64V-NEXT:    vand.vx v24, v11, a6
+; RV64V-NEXT:    slli a5, t0, 32
+; RV64V-NEXT:    sd a5, 96(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v25, v11, a7
+; RV64V-NEXT:    slli a5, t0, 33
+; RV64V-NEXT:    sd a5, 88(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v26, v11, t1
+; RV64V-NEXT:    slli a5, t0, 34
+; RV64V-NEXT:    sd a5, 80(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v27, v11, a2
+; RV64V-NEXT:    slli a2, t0, 35
+; RV64V-NEXT:    sd a2, 72(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v28, v11, t2
+; RV64V-NEXT:    slli a2, t0, 36
+; RV64V-NEXT:    sd a2, 64(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v29, v11, t3
+; RV64V-NEXT:    slli a2, t0, 37
+; RV64V-NEXT:    sd a2, 56(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v30, v11, t4
+; RV64V-NEXT:    slli a2, t0, 38
+; RV64V-NEXT:    sd a2, 48(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v31, v11, s3
+; RV64V-NEXT:    slli a2, t0, 39
+; RV64V-NEXT:    sd a2, 40(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v7, v11, s4
+; RV64V-NEXT:    slli a2, t0, 40
+; RV64V-NEXT:    sd a2, 32(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    lui a2, 1024
+; RV64V-NEXT:    vand.vx v9, v11, a2
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    mv a5, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a5, a5, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a5
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vs1r.v v9, (a2) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli a2, t0, 41
+; RV64V-NEXT:    sd a2, 24(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    lui a2, 2048
+; RV64V-NEXT:    vand.vx v9, v11, a2
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    mv a5, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a5, a5, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a5, a5, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a5
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vs1r.v v9, (a2) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli s4, t0, 42
+; RV64V-NEXT:    vand.vx v9, v11, a0
+; RV64V-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a2, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a2, a2, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a2
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli s5, t0, 43
+; RV64V-NEXT:    vand.vx v9, v11, a1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli s6, t0, 44
+; RV64V-NEXT:    vand.vx v9, v11, a3
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli s7, t0, 45
+; RV64V-NEXT:    vand.vx v9, v1, t6
+; RV64V-NEXT:    vor.vv v9, v9, v5
+; RV64V-NEXT:    vand.vx v12, v11, s8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli s8, t0, 46
+; RV64V-NEXT:    vand.vx v10, v10, t5
+; RV64V-NEXT:    vor.vv v10, v10, v2
+; RV64V-NEXT:    vand.vx v12, v11, s9
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli s9, t0, 47
+; RV64V-NEXT:    vsll.vi v8, v8, 24
+; RV64V-NEXT:    vor.vv v9, v10, v9
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v10, v18, t5
+; RV64V-NEXT:    vsll.vi v10, v10, 8
+; RV64V-NEXT:    vor.vv v8, v8, v10
+; RV64V-NEXT:    vand.vx v10, v18, t6
+; RV64V-NEXT:    vsll.vx v10, v10, a4
+; RV64V-NEXT:    vor.vv v10, v4, v10
+; RV64V-NEXT:    vand.vx v12, v11, s11
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli s11, t0, 48
+; RV64V-NEXT:    vor.vv v8, v10, v8
+; RV64V-NEXT:    vand.vx v10, v11, ra
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli ra, t0, 49
+; RV64V-NEXT:    vor.vv v8, v8, v9
+; RV64V-NEXT:    vsrl.vi v9, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, s0
+; RV64V-NEXT:    vand.vx v9, v9, s0
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    vsrl.vi v9, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, s1
+; RV64V-NEXT:    vand.vx v9, v9, s1
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    vsrl.vi v9, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, s2
+; RV64V-NEXT:    vand.vx v9, v9, s2
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v4, v9, v8
+; RV64V-NEXT:    vand.vx v8, v11, s10
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    slli t4, t0, 50
+; RV64V-NEXT:    slli t3, t0, 51
+; RV64V-NEXT:    slli t2, t0, 52
+; RV64V-NEXT:    slli s10, t0, 53
+; RV64V-NEXT:    slli t1, t0, 54
+; RV64V-NEXT:    slli a7, t0, 55
+; RV64V-NEXT:    slli a6, t0, 56
+; RV64V-NEXT:    slli a5, t0, 57
+; RV64V-NEXT:    slli a4, t0, 58
+; RV64V-NEXT:    slli a2, t0, 59
+; RV64V-NEXT:    slli a1, t0, 60
+; RV64V-NEXT:    slli a3, t0, 61
+; RV64V-NEXT:    slli t0, t0, 62
+; RV64V-NEXT:    li a0, -1
+; RV64V-NEXT:    slli a0, a0, 63
+; RV64V-NEXT:    ld s3, 96(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v11, s3
+; RV64V-NEXT:    csrr s3, vlenb
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    mv s4, s3
+; RV64V-NEXT:    slli s3, s3, 4
+; RV64V-NEXT:    add s3, s3, s4
+; RV64V-NEXT:    add s3, sp, s3
+; RV64V-NEXT:    addi s3, s3, 112
+; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    ld s3, 88(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v11, s3
+; RV64V-NEXT:    csrr s3, vlenb
+; RV64V-NEXT:    slli s4, s3, 5
+; RV64V-NEXT:    add s3, s4, s3
+; RV64V-NEXT:    add s3, sp, s3
+; RV64V-NEXT:    addi s3, s3, 112
+; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    ld s3, 80(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v11, s3
+; RV64V-NEXT:    csrr s3, vlenb
+; RV64V-NEXT:    slli s3, s3, 5
+; RV64V-NEXT:    add s3, sp, s3
+; RV64V-NEXT:    addi s3, s3, 112
+; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    ld s3, 72(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v11, s3
+; RV64V-NEXT:    csrr s3, vlenb
+; RV64V-NEXT:    slli s4, s3, 5
+; RV64V-NEXT:    sub s3, s4, s3
+; RV64V-NEXT:    add s3, sp, s3
+; RV64V-NEXT:    addi s3, s3, 112
+; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    ld s3, 64(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v11, s3
+; RV64V-NEXT:    csrr s3, vlenb
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    mv s4, s3
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    add s4, s4, s3
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    add s4, s4, s3
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    add s3, s3, s4
+; RV64V-NEXT:    add s3, sp, s3
+; RV64V-NEXT:    addi s3, s3, 112
+; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    ld s3, 56(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v11, s3
+; RV64V-NEXT:    csrr s3, vlenb
+; RV64V-NEXT:    mv s4, s3
+; RV64V-NEXT:    slli s3, s3, 2
+; RV64V-NEXT:    add s4, s4, s3
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    add s4, s4, s3
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    add s3, s3, s4
+; RV64V-NEXT:    add s3, sp, s3
+; RV64V-NEXT:    addi s3, s3, 112
+; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    ld s3, 48(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v11, s3
+; RV64V-NEXT:    csrr s3, vlenb
+; RV64V-NEXT:    slli s3, s3, 2
+; RV64V-NEXT:    mv s4, s3
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    add s4, s4, s3
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    add s3, s3, s4
+; RV64V-NEXT:    add s3, sp, s3
+; RV64V-NEXT:    addi s3, s3, 112
+; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    ld s3, 40(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v11, s3
+; RV64V-NEXT:    csrr s3, vlenb
+; RV64V-NEXT:    mv s4, s3
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    add s4, s4, s3
+; RV64V-NEXT:    slli s3, s3, 2
+; RV64V-NEXT:    add s4, s4, s3
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    add s3, s3, s4
+; RV64V-NEXT:    add s3, sp, s3
+; RV64V-NEXT:    addi s3, s3, 112
+; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    ld s3, 32(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v11, s3
+; RV64V-NEXT:    csrr s3, vlenb
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    mv s4, s3
+; RV64V-NEXT:    slli s3, s3, 2
+; RV64V-NEXT:    add s4, s4, s3
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    add s3, s3, s4
+; RV64V-NEXT:    add s3, sp, s3
+; RV64V-NEXT:    addi s3, s3, 112
+; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    ld s3, 24(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    vand.vx v8, v11, s3
+; RV64V-NEXT:    csrr s3, vlenb
+; RV64V-NEXT:    mv s4, s3
+; RV64V-NEXT:    slli s3, s3, 3
+; RV64V-NEXT:    add s4, s4, s3
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    add s3, s3, s4
+; RV64V-NEXT:    ld s4, 8(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    add s3, sp, s3
+; RV64V-NEXT:    addi s3, s3, 112
+; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, s4
+; RV64V-NEXT:    csrr s3, vlenb
+; RV64V-NEXT:    slli s3, s3, 3
+; RV64V-NEXT:    mv s4, s3
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    add s3, s3, s4
+; RV64V-NEXT:    add s3, sp, s3
+; RV64V-NEXT:    addi s3, s3, 112
+; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, s5
+; RV64V-NEXT:    csrr s3, vlenb
+; RV64V-NEXT:    mv s4, s3
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    add s4, s4, s3
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    add s4, s4, s3
+; RV64V-NEXT:    slli s3, s3, 2
+; RV64V-NEXT:    add s3, s3, s4
+; RV64V-NEXT:    add s3, sp, s3
+; RV64V-NEXT:    addi s3, s3, 112
+; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, s6
+; RV64V-NEXT:    csrr s3, vlenb
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    mv s4, s3
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    add s4, s4, s3
+; RV64V-NEXT:    slli s3, s3, 2
+; RV64V-NEXT:    add s3, s3, s4
+; RV64V-NEXT:    add s3, sp, s3
+; RV64V-NEXT:    addi s3, s3, 112
+; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, s7
+; RV64V-NEXT:    csrr s3, vlenb
+; RV64V-NEXT:    mv s4, s3
+; RV64V-NEXT:    slli s3, s3, 2
+; RV64V-NEXT:    add s4, s4, s3
+; RV64V-NEXT:    slli s3, s3, 2
+; RV64V-NEXT:    add s3, s3, s4
+; RV64V-NEXT:    add s3, sp, s3
+; RV64V-NEXT:    addi s3, s3, 112
+; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, s8
+; RV64V-NEXT:    csrr s3, vlenb
+; RV64V-NEXT:    slli s3, s3, 2
+; RV64V-NEXT:    mv s4, s3
+; RV64V-NEXT:    slli s3, s3, 2
+; RV64V-NEXT:    add s3, s3, s4
+; RV64V-NEXT:    add s3, sp, s3
+; RV64V-NEXT:    addi s3, s3, 112
+; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, s9
+; RV64V-NEXT:    csrr s3, vlenb
+; RV64V-NEXT:    mv s4, s3
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    add s4, s4, s3
+; RV64V-NEXT:    slli s3, s3, 3
+; RV64V-NEXT:    add s3, s3, s4
+; RV64V-NEXT:    add s3, sp, s3
+; RV64V-NEXT:    addi s3, s3, 112
+; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, s11
+; RV64V-NEXT:    csrr s3, vlenb
+; RV64V-NEXT:    slli s3, s3, 1
+; RV64V-NEXT:    mv s4, s3
+; RV64V-NEXT:    slli s3, s3, 3
+; RV64V-NEXT:    add s3, s3, s4
+; RV64V-NEXT:    add s3, sp, s3
+; RV64V-NEXT:    addi s3, s3, 112
+; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, ra
+; RV64V-NEXT:    csrr s3, vlenb
+; RV64V-NEXT:    slli s4, s3, 4
+; RV64V-NEXT:    add s3, s4, s3
+; RV64V-NEXT:    add s3, sp, s3
+; RV64V-NEXT:    addi s3, s3, 112
+; RV64V-NEXT:    vs1r.v v8, (s3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, t4
+; RV64V-NEXT:    csrr t4, vlenb
+; RV64V-NEXT:    slli t4, t4, 4
+; RV64V-NEXT:    add t4, sp, t4
+; RV64V-NEXT:    addi t4, t4, 112
+; RV64V-NEXT:    vs1r.v v8, (t4) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, t3
+; RV64V-NEXT:    csrr t3, vlenb
+; RV64V-NEXT:    slli t4, t3, 4
+; RV64V-NEXT:    sub t3, t4, t3
+; RV64V-NEXT:    add t3, sp, t3
+; RV64V-NEXT:    addi t3, t3, 112
+; RV64V-NEXT:    vs1r.v v8, (t3) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, t2
+; RV64V-NEXT:    csrr t2, vlenb
+; RV64V-NEXT:    slli t2, t2, 1
+; RV64V-NEXT:    mv t3, t2
+; RV64V-NEXT:    slli t2, t2, 1
+; RV64V-NEXT:    add t3, t3, t2
+; RV64V-NEXT:    slli t2, t2, 1
+; RV64V-NEXT:    add t2, t2, t3
+; RV64V-NEXT:    add t2, sp, t2
+; RV64V-NEXT:    addi t2, t2, 112
+; RV64V-NEXT:    vs1r.v v8, (t2) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, s10
+; RV64V-NEXT:    csrr t2, vlenb
+; RV64V-NEXT:    mv t3, t2
+; RV64V-NEXT:    slli t2, t2, 2
+; RV64V-NEXT:    add t3, t3, t2
+; RV64V-NEXT:    slli t2, t2, 1
+; RV64V-NEXT:    add t2, t2, t3
+; RV64V-NEXT:    add t2, sp, t2
+; RV64V-NEXT:    addi t2, t2, 112
+; RV64V-NEXT:    vs1r.v v8, (t2) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, t1
+; RV64V-NEXT:    csrr t1, vlenb
+; RV64V-NEXT:    slli t1, t1, 2
+; RV64V-NEXT:    mv t2, t1
+; RV64V-NEXT:    slli t1, t1, 1
+; RV64V-NEXT:    add t1, t1, t2
+; RV64V-NEXT:    add t1, sp, t1
+; RV64V-NEXT:    addi t1, t1, 112
+; RV64V-NEXT:    vs1r.v v8, (t1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, a7
+; RV64V-NEXT:    csrr a7, vlenb
+; RV64V-NEXT:    mv t1, a7
+; RV64V-NEXT:    slli a7, a7, 1
+; RV64V-NEXT:    add t1, t1, a7
+; RV64V-NEXT:    slli a7, a7, 2
+; RV64V-NEXT:    add a7, a7, t1
+; RV64V-NEXT:    add a7, sp, a7
+; RV64V-NEXT:    addi a7, a7, 112
+; RV64V-NEXT:    vs1r.v v8, (a7) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, a6
+; RV64V-NEXT:    csrr a6, vlenb
+; RV64V-NEXT:    slli a6, a6, 1
+; RV64V-NEXT:    mv a7, a6
+; RV64V-NEXT:    slli a6, a6, 2
+; RV64V-NEXT:    add a6, a6, a7
+; RV64V-NEXT:    add a6, sp, a6
+; RV64V-NEXT:    addi a6, a6, 112
+; RV64V-NEXT:    vs1r.v v8, (a6) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, a5
+; RV64V-NEXT:    csrr a5, vlenb
+; RV64V-NEXT:    slli a6, a5, 3
+; RV64V-NEXT:    add a5, a6, a5
+; RV64V-NEXT:    add a5, sp, a5
+; RV64V-NEXT:    addi a5, a5, 112
+; RV64V-NEXT:    vs1r.v v8, (a5) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, a4
+; RV64V-NEXT:    csrr a4, vlenb
+; RV64V-NEXT:    slli a4, a4, 3
+; RV64V-NEXT:    add a4, sp, a4
+; RV64V-NEXT:    addi a4, a4, 112
+; RV64V-NEXT:    vs1r.v v8, (a4) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, a2
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a4, a2, 3
+; RV64V-NEXT:    sub a2, a4, a2
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vs1r.v v8, (a2) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vx v8, v11, a1
+; RV64V-NEXT:    csrr a1, vlenb
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    mv a2, a1
+; RV64V-NEXT:    slli a1, a1, 1
+; RV64V-NEXT:    add a1, a1, a2
+; RV64V-NEXT:    add a1, sp, a1
+; RV64V-NEXT:    addi a1, a1, 112
+; RV64V-NEXT:    vs1r.v v8, (a1) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vand.vi v8, v11, 2
+; RV64V-NEXT:    vand.vi v9, v11, 1
+; RV64V-NEXT:    vand.vi v10, v11, 4
+; RV64V-NEXT:    vand.vi v12, v11, 8
+; RV64V-NEXT:    vand.vx v6, v11, a3
+; RV64V-NEXT:    vand.vx v5, v11, t0
+; RV64V-NEXT:    vand.vx v2, v11, a0
+; RV64V-NEXT:    vmul.vv v3, v4, v8
+; RV64V-NEXT:    vmul.vv v8, v4, v9
+; RV64V-NEXT:    vmul.vv v9, v4, v10
+; RV64V-NEXT:    vmul.vv v10, v4, v12
+; RV64V-NEXT:    vmul.vv v11, v4, v13
+; RV64V-NEXT:    vmul.vv v12, v4, v14
+; RV64V-NEXT:    vmul.vv v13, v4, v15
+; RV64V-NEXT:    vmul.vv v14, v4, v17
+; RV64V-NEXT:    vmul.vv v15, v4, v19
+; RV64V-NEXT:    vmul.vv v16, v4, v20
+; RV64V-NEXT:    vmul.vv v17, v4, v21
+; RV64V-NEXT:    vmul.vv v18, v4, v22
+; RV64V-NEXT:    vmul.vv v19, v4, v23
+; RV64V-NEXT:    vmul.vv v20, v4, v24
+; RV64V-NEXT:    vmul.vv v21, v4, v25
+; RV64V-NEXT:    vmul.vv v22, v4, v26
+; RV64V-NEXT:    vmul.vv v23, v4, v27
+; RV64V-NEXT:    vmul.vv v24, v4, v28
+; RV64V-NEXT:    vmul.vv v25, v4, v29
+; RV64V-NEXT:    vmul.vv v26, v4, v30
+; RV64V-NEXT:    vmul.vv v27, v4, v31
+; RV64V-NEXT:    vmul.vv v28, v4, v7
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v29, v4, v29
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v30, v4, v30
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v31, v4, v31
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v7, v4, v7
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 2
+; RV64V-NEXT:    add a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 1
+; RV64V-NEXT:    add a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v0
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v0
+; RV64V-NEXT:    addi a0, sp, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v0, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 5
+; RV64V-NEXT:    add a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 5
+; RV64V-NEXT:    add a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 5
+; RV64V-NEXT:    sub a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 5
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 5
+; RV64V-NEXT:    sub a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 4
+; RV64V-NEXT:    add a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 4
+; RV64V-NEXT:    add a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 4
+; RV64V-NEXT:    sub a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 4
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 3
+; RV64V-NEXT:    add a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 3
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a1, a0, 3
+; RV64V-NEXT:    sub a0, a1, a0
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vmul.vv v1, v4, v1
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; RV64V-NEXT:    vmul.vv v6, v4, v6
+; RV64V-NEXT:    vmul.vv v5, v4, v5
+; RV64V-NEXT:    vmul.vv v4, v4, v2
+; RV64V-NEXT:    vxor.vv v8, v8, v3
+; RV64V-NEXT:    vxor.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v8, v10
+; RV64V-NEXT:    vxor.vv v8, v8, v11
+; RV64V-NEXT:    vxor.vv v8, v8, v12
+; RV64V-NEXT:    vxor.vv v8, v8, v13
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vxor.vv v9, v8, v15
+; RV64V-NEXT:    vxor.vv v9, v9, v16
+; RV64V-NEXT:    vxor.vv v9, v9, v17
+; RV64V-NEXT:    vxor.vv v9, v9, v18
+; RV64V-NEXT:    vxor.vv v9, v9, v19
+; RV64V-NEXT:    vxor.vv v9, v9, v20
+; RV64V-NEXT:    vxor.vv v9, v9, v21
+; RV64V-NEXT:    vxor.vv v9, v9, v22
+; RV64V-NEXT:    vxor.vv v9, v9, v23
+; RV64V-NEXT:    vxor.vv v9, v9, v24
+; RV64V-NEXT:    vxor.vv v9, v9, v25
+; RV64V-NEXT:    vxor.vv v9, v9, v26
+; RV64V-NEXT:    vxor.vv v9, v9, v27
+; RV64V-NEXT:    vxor.vv v9, v9, v28
+; RV64V-NEXT:    vxor.vv v9, v9, v29
+; RV64V-NEXT:    vxor.vv v9, v9, v30
+; RV64V-NEXT:    vxor.vv v10, v9, v31
+; RV64V-NEXT:    vxor.vv v10, v10, v7
+; RV64V-NEXT:    li a1, 56
+; RV64V-NEXT:    vsll.vx v8, v8, a1
+; RV64V-NEXT:    vand.vx v9, v9, t6
+; RV64V-NEXT:    li a0, 40
+; RV64V-NEXT:    vsll.vx v9, v9, a0
+; RV64V-NEXT:    vor.vv v8, v8, v9
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a3, a2, 2
+; RV64V-NEXT:    add a2, a3, a2
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v9, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v9, v10, v9
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v9, v9, v10
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a3, a2, 1
+; RV64V-NEXT:    add a2, a3, a2
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v9, v9, v10
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v9, v9, v10
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v9, v9, v10
+; RV64V-NEXT:    addi a2, sp, 112
+; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v9, v9, v10
+; RV64V-NEXT:    vxor.vv v9, v9, v0
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 4
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v9, v9, v10
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a3, a2, 5
+; RV64V-NEXT:    add a2, a3, a2
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v9, v9, v10
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 5
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v9, v9, v10
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a3, a2, 5
+; RV64V-NEXT:    sub a2, a3, a2
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v9, v9, v10
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v9, v9, v10
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v9, v9, v10
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v9, v9, v10
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v10, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v9, v10
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v11, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v11, v10, v11
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a3, a2, 4
+; RV64V-NEXT:    add a2, a3, a2
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    vsrl.vi v12, v9, 8
+; RV64V-NEXT:    vand.vx v12, v12, t5
+; RV64V-NEXT:    vsrl.vi v10, v10, 24
+; RV64V-NEXT:    lui a2, 4080
+; RV64V-NEXT:    vand.vx v10, v10, a2
+; RV64V-NEXT:    vor.vv v10, v12, v10
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 112
+; RV64V-NEXT:    vl1r.v v12, (a3) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 112
+; RV64V-NEXT:    vl1r.v v12, (a3) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 112
+; RV64V-NEXT:    vl1r.v v12, (a3) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    slli a3, a3, 2
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 3
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 112
+; RV64V-NEXT:    vl1r.v v12, (a3) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    csrr a3, vlenb
+; RV64V-NEXT:    mv a4, a3
+; RV64V-NEXT:    slli a3, a3, 1
+; RV64V-NEXT:    add a4, a4, a3
+; RV64V-NEXT:    slli a3, a3, 4
+; RV64V-NEXT:    add a3, a3, a4
+; RV64V-NEXT:    add a3, sp, a3
+; RV64V-NEXT:    addi a3, a3, 112
+; RV64V-NEXT:    vl1r.v v12, (a3) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v11, v11, v12
+; RV64V-NEXT:    vand.vx v9, v9, a2
+; RV64V-NEXT:    vsll.vi v9, v9, 24
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v11, v12
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 3
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v13
+; RV64V-NEXT:    vand.vx v13, v11, t5
+; RV64V-NEXT:    vsll.vi v13, v13, 8
+; RV64V-NEXT:    vor.vv v9, v9, v13
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v12, v12, v13
+; RV64V-NEXT:    vor.vv v8, v8, v9
+; RV64V-NEXT:    csrr a2, vlenb
+; RV64V-NEXT:    mv a3, a2
+; RV64V-NEXT:    slli a2, a2, 1
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a3, a3, a2
+; RV64V-NEXT:    slli a2, a2, 2
+; RV64V-NEXT:    add a2, a2, a3
+; RV64V-NEXT:    add a2, sp, a2
+; RV64V-NEXT:    addi a2, a2, 112
+; RV64V-NEXT:    vl1r.v v9, (a2) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v9, v12, v9
+; RV64V-NEXT:    vsrl.vx v11, v11, a0
+; RV64V-NEXT:    vand.vx v11, v11, t6
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    mv a2, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a2, a2, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a2
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vxor.vv v9, v9, v12
+; RV64V-NEXT:    vxor.vv v9, v9, v6
+; RV64V-NEXT:    vxor.vv v9, v9, v5
+; RV64V-NEXT:    vxor.vv v9, v9, v4
+; RV64V-NEXT:    vsrl.vx v9, v9, a1
+; RV64V-NEXT:    vor.vv v9, v11, v9
+; RV64V-NEXT:    vor.vv v9, v10, v9
+; RV64V-NEXT:    vor.vv v8, v8, v9
+; RV64V-NEXT:    vsrl.vi v9, v8, 4
+; RV64V-NEXT:    vand.vx v8, v8, s0
+; RV64V-NEXT:    vand.vx v9, v9, s0
+; RV64V-NEXT:    vsll.vi v8, v8, 4
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    vsrl.vi v9, v8, 2
+; RV64V-NEXT:    vand.vx v8, v8, s1
+; RV64V-NEXT:    vand.vx v9, v9, s1
+; RV64V-NEXT:    vsll.vi v8, v8, 2
+; RV64V-NEXT:    vor.vv v8, v9, v8
+; RV64V-NEXT:    vsrl.vi v9, v8, 1
+; RV64V-NEXT:    vand.vx v8, v8, s2
+; RV64V-NEXT:    vand.vx v9, v9, s2
+; RV64V-NEXT:    vadd.vv v8, v8, v8
+; RV64V-NEXT:    vor.vv v9, v9, v8
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add a0, sp, a0
+; RV64V-NEXT:    addi a0, a0, 112
+; RV64V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; RV64V-NEXT:    vsrl.vi v8, v9, 1, v0.t
+; RV64V-NEXT:    csrr a0, vlenb
+; RV64V-NEXT:    mv a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 1
+; RV64V-NEXT:    add a1, a1, a0
+; RV64V-NEXT:    slli a0, a0, 2
+; RV64V-NEXT:    add a0, a0, a1
+; RV64V-NEXT:    add sp, sp, a0
+; RV64V-NEXT:    .cfi_def_cfa sp, 224
+; RV64V-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
+; RV64V-NEXT:    .cfi_restore ra
+; RV64V-NEXT:    .cfi_restore s0
+; RV64V-NEXT:    .cfi_restore s1
+; RV64V-NEXT:    .cfi_restore s2
+; RV64V-NEXT:    .cfi_restore s3
+; RV64V-NEXT:    .cfi_restore s4
+; RV64V-NEXT:    .cfi_restore s5
+; RV64V-NEXT:    .cfi_restore s6
+; RV64V-NEXT:    .cfi_restore s7
+; RV64V-NEXT:    .cfi_restore s8
+; RV64V-NEXT:    .cfi_restore s9
+; RV64V-NEXT:    .cfi_restore s10
+; RV64V-NEXT:    .cfi_restore s11
+; RV64V-NEXT:    addi sp, sp, 224
+; RV64V-NEXT:    .cfi_def_cfa_offset 0
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv1i64_vv_mask:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
+; RV32ZVBC-NEXT:    vclmulh.vv v8, v8, v9, v0.t
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv1i64_vv_mask:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
+; RV64ZVBC-NEXT:    vclmulh.vv v8, v8, v9, v0.t
+; RV64ZVBC-NEXT:    ret
+  %va.ext = zext <vscale x 1 x i64> %va to <vscale x 1 x i128>
+  %vb.ext = zext <vscale x 1 x i64> %vb to <vscale x 1 x i128>
+  %clmul = call <vscale x 1 x i128> @llvm.clmul.nxv1i128(<vscale x 1 x i128> %va.ext, <vscale x 1 x i128> %vb.ext)
+  %res.ext = lshr <vscale x 1 x i128> %clmul, splat(i128 64)
+  %res = trunc <vscale x 1 x i128> %res.ext to <vscale x 1 x i64>
+  %sel = select <vscale x 1 x i1> %mask, <vscale x 1 x i64> %res, <vscale x 1 x i64> %va
+  ret <vscale x 1 x i64> %sel
+}
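(For reference: the IR body above is the canonical clmulh pattern -- widen both operands to i128, carry-less multiply, then take bits 64..127. A minimal scalar sketch of those semantics, assuming the usual XOR-of-shifted-partial-products definition of clmul; the function name here is ours for illustration, not anything the backend emits:

    #include <cstdint>

    // High 64 bits of the 128-bit carry-less product of a and b.
    // Bit i of b contributes (a << i) to the full product; only the part
    // that crosses bit 64, i.e. a >> (64 - i), lands in the high half.
    std::uint64_t clmulh64(std::uint64_t a, std::uint64_t b) {
      std::uint64_t hi = 0;
      for (int i = 1; i < 64; ++i)
        if ((b >> i) & 1)
          hi ^= a >> (64 - i);
      return hi;
    }

With Zvbc this whole pattern collapses to the single vclmulh.vv shown in the ZVBC check lines.)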
+
+define <vscale x 1 x i64> @clmulh_nxv1i64_vx_mask(<vscale x 1 x i64> %va, i64 %b, <vscale x 1 x i1> %mask) {
+; RV32V-LABEL: clmulh_nxv1i64_vx_mask:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -336
+; RV32V-NEXT:    .cfi_def_cfa_offset 336
+; RV32V-NEXT:    sw ra, 332(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s0, 328(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s1, 324(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s2, 320(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s3, 316(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s4, 312(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s5, 308(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s6, 304(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s7, 300(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s8, 296(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s9, 292(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s10, 288(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s11, 284(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    .cfi_offset ra, -4
+; RV32V-NEXT:    .cfi_offset s0, -8
+; RV32V-NEXT:    .cfi_offset s1, -12
+; RV32V-NEXT:    .cfi_offset s2, -16
+; RV32V-NEXT:    .cfi_offset s3, -20
+; RV32V-NEXT:    .cfi_offset s4, -24
+; RV32V-NEXT:    .cfi_offset s5, -28
+; RV32V-NEXT:    .cfi_offset s6, -32
+; RV32V-NEXT:    .cfi_offset s7, -36
+; RV32V-NEXT:    .cfi_offset s8, -40
+; RV32V-NEXT:    .cfi_offset s9, -44
+; RV32V-NEXT:    .cfi_offset s10, -48
+; RV32V-NEXT:    .cfi_offset s11, -52
+; RV32V-NEXT:    sw a0, 0(sp)
+; RV32V-NEXT:    sw a1, 4(sp)
+; RV32V-NEXT:    mv t6, sp
+; RV32V-NEXT:    lui s0, 1044480
+; RV32V-NEXT:    li s11, 1
+; RV32V-NEXT:    li s6, 2
+; RV32V-NEXT:    li s3, 4
+; RV32V-NEXT:    li s8, 8
+; RV32V-NEXT:    li s10, 32
+; RV32V-NEXT:    li s9, 64
+; RV32V-NEXT:    li s5, 128
+; RV32V-NEXT:    li s4, 256
+; RV32V-NEXT:    li s2, 512
+; RV32V-NEXT:    li s1, 1024
+; RV32V-NEXT:    lui ra, 1
+; RV32V-NEXT:    lui t5, 2
+; RV32V-NEXT:    lui t4, 4
+; RV32V-NEXT:    lui t3, 8
+; RV32V-NEXT:    lui t2, 16
+; RV32V-NEXT:    lui t1, 32
+; RV32V-NEXT:    lui t0, 64
+; RV32V-NEXT:    lui a7, 128
+; RV32V-NEXT:    lui a6, 256
+; RV32V-NEXT:    lui a5, 512
+; RV32V-NEXT:    lui a4, 1024
+; RV32V-NEXT:    lui a3, 2048
+; RV32V-NEXT:    lui a2, 4096
+; RV32V-NEXT:    lui a1, 8192
+; RV32V-NEXT:    vsetvli s7, zero, e64, m1, ta, ma
+; RV32V-NEXT:    vlse64.v v13, (t6), zero
+; RV32V-NEXT:    lui t6, 16384
+; RV32V-NEXT:    sw s0, 248(sp)
+; RV32V-NEXT:    lui s0, 32768
+; RV32V-NEXT:    sw zero, 252(sp)
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    sw a0, 8(sp)
+; RV32V-NEXT:    sw zero, 12(sp)
+; RV32V-NEXT:    sw zero, 272(sp)
+; RV32V-NEXT:    sw s11, 276(sp)
+; RV32V-NEXT:    sw zero, 256(sp)
+; RV32V-NEXT:    sw s6, 260(sp)
+; RV32V-NEXT:    lui s6, 65536
+; RV32V-NEXT:    sw zero, 264(sp)
+; RV32V-NEXT:    sw s3, 268(sp)
+; RV32V-NEXT:    lui s7, 131072
+; RV32V-NEXT:    sw zero, 240(sp)
+; RV32V-NEXT:    sw s8, 244(sp)
+; RV32V-NEXT:    lui s8, 262144
+; RV32V-NEXT:    sw zero, 232(sp)
+; RV32V-NEXT:    li s3, 16
+; RV32V-NEXT:    sw s3, 236(sp)
+; RV32V-NEXT:    li s3, 16
+; RV32V-NEXT:    sw zero, 224(sp)
+; RV32V-NEXT:    sw s10, 228(sp)
+; RV32V-NEXT:    sw zero, 216(sp)
+; RV32V-NEXT:    sw s9, 220(sp)
+; RV32V-NEXT:    li s10, 64
+; RV32V-NEXT:    sw zero, 208(sp)
+; RV32V-NEXT:    sw s5, 212(sp)
+; RV32V-NEXT:    sw zero, 200(sp)
+; RV32V-NEXT:    sw s4, 204(sp)
+; RV32V-NEXT:    sw zero, 192(sp)
+; RV32V-NEXT:    sw s2, 196(sp)
+; RV32V-NEXT:    li s4, 512
+; RV32V-NEXT:    sw zero, 184(sp)
+; RV32V-NEXT:    sw s1, 188(sp)
+; RV32V-NEXT:    li s2, 1024
+; RV32V-NEXT:    slli s11, s11, 11
+; RV32V-NEXT:    sw zero, 176(sp)
+; RV32V-NEXT:    sw s11, 180(sp)
+; RV32V-NEXT:    sw zero, 168(sp)
+; RV32V-NEXT:    sw ra, 172(sp)
+; RV32V-NEXT:    sw zero, 160(sp)
+; RV32V-NEXT:    sw t5, 164(sp)
+; RV32V-NEXT:    lui s5, 2
+; RV32V-NEXT:    sw zero, 152(sp)
+; RV32V-NEXT:    sw t4, 156(sp)
+; RV32V-NEXT:    lui s1, 4
+; RV32V-NEXT:    sw zero, 144(sp)
+; RV32V-NEXT:    sw t3, 148(sp)
+; RV32V-NEXT:    lui t4, 8
+; RV32V-NEXT:    sw zero, 136(sp)
+; RV32V-NEXT:    sw t2, 140(sp)
+; RV32V-NEXT:    lui t5, 16
+; RV32V-NEXT:    sw zero, 128(sp)
+; RV32V-NEXT:    sw t1, 132(sp)
+; RV32V-NEXT:    lui t3, 32
+; RV32V-NEXT:    sw zero, 120(sp)
+; RV32V-NEXT:    sw t0, 124(sp)
+; RV32V-NEXT:    lui t1, 64
+; RV32V-NEXT:    sw zero, 112(sp)
+; RV32V-NEXT:    sw a7, 116(sp)
+; RV32V-NEXT:    lui t2, 128
+; RV32V-NEXT:    sw zero, 104(sp)
+; RV32V-NEXT:    sw a6, 108(sp)
+; RV32V-NEXT:    sw zero, 96(sp)
+; RV32V-NEXT:    sw a5, 100(sp)
+; RV32V-NEXT:    lui t0, 512
+; RV32V-NEXT:    sw zero, 88(sp)
+; RV32V-NEXT:    sw a4, 92(sp)
+; RV32V-NEXT:    lui a7, 1024
+; RV32V-NEXT:    sw zero, 80(sp)
+; RV32V-NEXT:    sw a3, 84(sp)
+; RV32V-NEXT:    lui a4, 2048
+; RV32V-NEXT:    sw zero, 72(sp)
+; RV32V-NEXT:    sw a2, 76(sp)
+; RV32V-NEXT:    lui a5, 4096
+; RV32V-NEXT:    sw zero, 64(sp)
+; RV32V-NEXT:    sw a1, 68(sp)
+; RV32V-NEXT:    lui a3, 8192
+; RV32V-NEXT:    sw zero, 56(sp)
+; RV32V-NEXT:    sw t6, 60(sp)
+; RV32V-NEXT:    sw zero, 48(sp)
+; RV32V-NEXT:    sw s0, 52(sp)
+; RV32V-NEXT:    sw zero, 40(sp)
+; RV32V-NEXT:    sw s6, 44(sp)
+; RV32V-NEXT:    sw zero, 32(sp)
+; RV32V-NEXT:    sw s7, 36(sp)
+; RV32V-NEXT:    sw zero, 24(sp)
+; RV32V-NEXT:    sw s8, 28(sp)
+; RV32V-NEXT:    sw zero, 16(sp)
+; RV32V-NEXT:    sw a0, 20(sp)
+; RV32V-NEXT:    lui a0, 61681
+; RV32V-NEXT:    addi a0, a0, -241
+; RV32V-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32V-NEXT:    vmv.v.x v9, a0
+; RV32V-NEXT:    lui a0, 209715
+; RV32V-NEXT:    addi a0, a0, 819
+; RV32V-NEXT:    vmv.v.x v10, a0
+; RV32V-NEXT:    lui a0, 349525
+; RV32V-NEXT:    addi a0, a0, 1365
+; RV32V-NEXT:    vmv.v.x v11, a0
+; RV32V-NEXT:    addi a0, sp, 248
+; RV32V-NEXT:    vsetvli a1, zero, e64, m1, ta, mu
+; RV32V-NEXT:    vlse64.v v12, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 8
+; RV32V-NEXT:    vlse64.v v6, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 272
+; RV32V-NEXT:    vlse64.v v7, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 256
+; RV32V-NEXT:    vlse64.v v31, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 264
+; RV32V-NEXT:    vlse64.v v30, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 240
+; RV32V-NEXT:    vlse64.v v29, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 232
+; RV32V-NEXT:    vlse64.v v28, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 224
+; RV32V-NEXT:    vlse64.v v27, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 216
+; RV32V-NEXT:    vlse64.v v22, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 208
+; RV32V-NEXT:    vlse64.v v19, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 200
+; RV32V-NEXT:    vlse64.v v18, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 192
+; RV32V-NEXT:    vlse64.v v23, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 184
+; RV32V-NEXT:    vlse64.v v16, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 176
+; RV32V-NEXT:    vlse64.v v14, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 168
+; RV32V-NEXT:    vlse64.v v15, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 160
+; RV32V-NEXT:    vlse64.v v17, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 152
+; RV32V-NEXT:    vlse64.v v20, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 144
+; RV32V-NEXT:    vlse64.v v25, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 136
+; RV32V-NEXT:    vlse64.v v21, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 128
+; RV32V-NEXT:    vlse64.v v24, (a0), zero
+; RV32V-NEXT:    addi a0, sp, 120
+; RV32V-NEXT:    vlse64.v v26, (a0), zero
+; RV32V-NEXT:    li ra, 56
+; RV32V-NEXT:    vsrl.vx v5, v13, ra
+; RV32V-NEXT:    li a2, 40
+; RV32V-NEXT:    vsrl.vx v4, v13, a2
+; RV32V-NEXT:    vsrl.vi v3, v13, 24
+; RV32V-NEXT:    vsrl.vi v2, v13, 8
+; RV32V-NEXT:    vsll.vx v1, v13, ra
+; RV32V-NEXT:    addi a0, t5, -256
+; RV32V-NEXT:    vand.vx v4, v4, a0
+; RV32V-NEXT:    vor.vv v5, v4, v5
+; RV32V-NEXT:    vand.vx v4, v13, a0
+; RV32V-NEXT:    vsll.vx v4, v4, a2
+; RV32V-NEXT:    vor.vv v4, v1, v4
+; RV32V-NEXT:    lui a1, 4080
+; RV32V-NEXT:    vand.vx v1, v13, a1
+; RV32V-NEXT:    vand.vx v3, v3, a1
+; RV32V-NEXT:    vsll.vi v1, v1, 24
+; RV32V-NEXT:    vand.vv v2, v2, v12
+; RV32V-NEXT:    vand.vv v13, v13, v12
+; RV32V-NEXT:    vor.vv v3, v2, v3
+; RV32V-NEXT:    vsll.vi v13, v13, 8
+; RV32V-NEXT:    vor.vv v5, v3, v5
+; RV32V-NEXT:    vor.vv v13, v1, v13
+; RV32V-NEXT:    vor.vv v13, v4, v13
+; RV32V-NEXT:    vor.vv v13, v13, v5
+; RV32V-NEXT:    vsrl.vi v5, v13, 4
+; RV32V-NEXT:    vand.vv v13, v13, v9
+; RV32V-NEXT:    vand.vv v5, v5, v9
+; RV32V-NEXT:    vsll.vi v13, v13, 4
+; RV32V-NEXT:    vor.vv v13, v5, v13
+; RV32V-NEXT:    vsrl.vi v5, v13, 2
+; RV32V-NEXT:    vand.vv v13, v13, v10
+; RV32V-NEXT:    vand.vv v5, v5, v10
+; RV32V-NEXT:    vsll.vi v13, v13, 2
+; RV32V-NEXT:    vor.vv v13, v5, v13
+; RV32V-NEXT:    vsrl.vi v5, v13, 1
+; RV32V-NEXT:    vand.vv v13, v13, v11
+; RV32V-NEXT:    vand.vv v5, v5, v11
+; RV32V-NEXT:    vadd.vv v13, v13, v13
+; RV32V-NEXT:    vor.vv v13, v5, v13
+; RV32V-NEXT:    vand.vx v4, v13, s3
+; RV32V-NEXT:    vsrl.vi v5, v8, 24
+; RV32V-NEXT:    vsrl.vx v3, v8, ra
+; RV32V-NEXT:    vsrl.vx v2, v8, a2
+; RV32V-NEXT:    vsll.vx v1, v8, ra
+; RV32V-NEXT:    vand.vx v2, v2, a0
+; RV32V-NEXT:    vor.vv v3, v2, v3
+; RV32V-NEXT:    vand.vx v2, v8, a0
+; RV32V-NEXT:    vsll.vx v2, v2, a2
+; RV32V-NEXT:    vor.vv v2, v1, v2
+; RV32V-NEXT:    vsrl.vi v1, v8, 8
+; RV32V-NEXT:    vand.vx v5, v5, a1
+; RV32V-NEXT:    vand.vv v1, v1, v12
+; RV32V-NEXT:    vor.vv v5, v1, v5
+; RV32V-NEXT:    vand.vx v1, v8, a1
+; RV32V-NEXT:    vsll.vi v1, v1, 24
+; RV32V-NEXT:    vor.vv v5, v5, v3
+; RV32V-NEXT:    vand.vv v3, v8, v12
+; RV32V-NEXT:    vsll.vi v3, v3, 8
+; RV32V-NEXT:    vor.vv v3, v1, v3
+; RV32V-NEXT:    li s9, 32
+; RV32V-NEXT:    vand.vx v1, v13, s9
+; RV32V-NEXT:    vor.vv v3, v2, v3
+; RV32V-NEXT:    vor.vv v5, v3, v5
+; RV32V-NEXT:    vsrl.vi v3, v5, 4
+; RV32V-NEXT:    vand.vv v5, v5, v9
+; RV32V-NEXT:    vand.vv v3, v3, v9
+; RV32V-NEXT:    vsll.vi v5, v5, 4
+; RV32V-NEXT:    vor.vv v5, v3, v5
+; RV32V-NEXT:    vsrl.vi v3, v5, 2
+; RV32V-NEXT:    vand.vv v5, v5, v10
+; RV32V-NEXT:    vand.vv v3, v3, v10
+; RV32V-NEXT:    vsll.vi v5, v5, 2
+; RV32V-NEXT:    vor.vv v5, v3, v5
+; RV32V-NEXT:    vsrl.vi v3, v5, 1
+; RV32V-NEXT:    vand.vv v5, v5, v11
+; RV32V-NEXT:    vand.vv v3, v3, v11
+; RV32V-NEXT:    vadd.vv v5, v5, v5
+; RV32V-NEXT:    vor.vv v5, v3, v5
+; RV32V-NEXT:    vand.vi v3, v13, 2
+; RV32V-NEXT:    vand.vi v2, v13, 1
+; RV32V-NEXT:    vmul.vv v3, v5, v3
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v3, v2, v3
+; RV32V-NEXT:    vand.vi v2, v13, 4
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v3, v3, v2
+; RV32V-NEXT:    vand.vi v2, v13, 8
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v3, v3, v2
+; RV32V-NEXT:    vand.vx v2, v13, s10
+; RV32V-NEXT:    vmul.vv v4, v5, v4
+; RV32V-NEXT:    vxor.vv v4, v3, v4
+; RV32V-NEXT:    li s3, 128
+; RV32V-NEXT:    vand.vx v3, v13, s3
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v4, v4, v1
+; RV32V-NEXT:    li s3, 256
+; RV32V-NEXT:    vand.vx v1, v13, s3
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v4, v4, v2
+; RV32V-NEXT:    vand.vx v2, v13, s4
+; RV32V-NEXT:    vmul.vv v3, v5, v3
+; RV32V-NEXT:    vxor.vv v4, v4, v3
+; RV32V-NEXT:    vand.vx v3, v13, s2
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v1, v4, v1
+; RV32V-NEXT:    vxor.vv v2, v1, v2
+; RV32V-NEXT:    vand.vx v1, v13, s11
+; RV32V-NEXT:    vmul.vv v3, v5, v3
+; RV32V-NEXT:    vxor.vv v3, v2, v3
+; RV32V-NEXT:    lui s2, 1
+; RV32V-NEXT:    vand.vx v2, v13, s2
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v3, v3, v1
+; RV32V-NEXT:    vand.vx v1, v13, s5
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v3, v3, v2
+; RV32V-NEXT:    vand.vx v2, v13, s1
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v3, v3, v1
+; RV32V-NEXT:    vand.vx v1, v13, t4
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v3, v3, v2
+; RV32V-NEXT:    vand.vx v2, v13, t5
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v3, v3, v1
+; RV32V-NEXT:    vand.vx v1, v13, t3
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v3, v3, v2
+; RV32V-NEXT:    vand.vx v2, v13, t1
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v3, v3, v1
+; RV32V-NEXT:    vand.vx v1, v13, t2
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v3, v3, v2
+; RV32V-NEXT:    vand.vx v2, v13, a6
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v3, v3, v1
+; RV32V-NEXT:    vand.vx v1, v13, t0
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v3, v3, v2
+; RV32V-NEXT:    vand.vx v2, v13, a7
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v3, v3, v1
+; RV32V-NEXT:    vand.vx v1, v13, a4
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vxor.vv v3, v3, v2
+; RV32V-NEXT:    vand.vx v2, v13, a5
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v3, v3, v1
+; RV32V-NEXT:    vand.vx v1, v13, a3
+; RV32V-NEXT:    vmul.vv v2, v5, v2
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v2, v3, v2
+; RV32V-NEXT:    vxor.vv v2, v2, v1
+; RV32V-NEXT:    vand.vx v1, v13, t6
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v2, v2, v1
+; RV32V-NEXT:    vand.vx v1, v13, s0
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v2, v2, v1
+; RV32V-NEXT:    vand.vx v1, v13, s6
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v2, v2, v1
+; RV32V-NEXT:    vand.vx v1, v13, s7
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v2, v2, v1
+; RV32V-NEXT:    vand.vx v1, v13, s8
+; RV32V-NEXT:    addi a3, sp, 112
+; RV32V-NEXT:    vmul.vv v1, v5, v1
+; RV32V-NEXT:    vxor.vv v1, v2, v1
+; RV32V-NEXT:    vlse64.v v2, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 104
+; RV32V-NEXT:    vand.vv v6, v13, v6
+; RV32V-NEXT:    vmul.vv v6, v5, v6
+; RV32V-NEXT:    vxor.vv v1, v1, v6
+; RV32V-NEXT:    vlse64.v v6, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 96
+; RV32V-NEXT:    vand.vv v7, v13, v7
+; RV32V-NEXT:    vmul.vv v7, v5, v7
+; RV32V-NEXT:    vxor.vv v1, v1, v7
+; RV32V-NEXT:    vlse64.v v7, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 88
+; RV32V-NEXT:    vand.vv v31, v13, v31
+; RV32V-NEXT:    vmul.vv v31, v5, v31
+; RV32V-NEXT:    vxor.vv v1, v1, v31
+; RV32V-NEXT:    vlse64.v v31, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 80
+; RV32V-NEXT:    vand.vv v30, v13, v30
+; RV32V-NEXT:    vmul.vv v30, v5, v30
+; RV32V-NEXT:    vxor.vv v1, v1, v30
+; RV32V-NEXT:    vlse64.v v30, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 72
+; RV32V-NEXT:    vand.vv v29, v13, v29
+; RV32V-NEXT:    vmul.vv v29, v5, v29
+; RV32V-NEXT:    vxor.vv v1, v1, v29
+; RV32V-NEXT:    vlse64.v v29, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 64
+; RV32V-NEXT:    vand.vv v28, v13, v28
+; RV32V-NEXT:    vmul.vv v28, v5, v28
+; RV32V-NEXT:    vxor.vv v1, v1, v28
+; RV32V-NEXT:    vlse64.v v28, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 56
+; RV32V-NEXT:    vand.vv v27, v13, v27
+; RV32V-NEXT:    vmul.vv v27, v5, v27
+; RV32V-NEXT:    vxor.vv v1, v1, v27
+; RV32V-NEXT:    vlse64.v v27, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 48
+; RV32V-NEXT:    vand.vv v22, v13, v22
+; RV32V-NEXT:    vmul.vv v22, v5, v22
+; RV32V-NEXT:    vxor.vv v1, v1, v22
+; RV32V-NEXT:    vlse64.v v22, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 40
+; RV32V-NEXT:    vand.vv v19, v13, v19
+; RV32V-NEXT:    vmul.vv v19, v5, v19
+; RV32V-NEXT:    vxor.vv v19, v1, v19
+; RV32V-NEXT:    vlse64.v v1, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 32
+; RV32V-NEXT:    vand.vv v18, v13, v18
+; RV32V-NEXT:    vand.vv v23, v13, v23
+; RV32V-NEXT:    vmul.vv v18, v5, v18
+; RV32V-NEXT:    vmul.vv v23, v5, v23
+; RV32V-NEXT:    vxor.vv v18, v19, v18
+; RV32V-NEXT:    vxor.vv v23, v18, v23
+; RV32V-NEXT:    vlse64.v v18, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 24
+; RV32V-NEXT:    vand.vv v16, v13, v16
+; RV32V-NEXT:    vmul.vv v16, v5, v16
+; RV32V-NEXT:    vxor.vv v16, v23, v16
+; RV32V-NEXT:    vlse64.v v23, (a3), zero
+; RV32V-NEXT:    addi a3, sp, 16
+; RV32V-NEXT:    vand.vv v14, v13, v14
+; RV32V-NEXT:    vmul.vv v14, v5, v14
+; RV32V-NEXT:    vxor.vv v14, v16, v14
+; RV32V-NEXT:    vlse64.v v16, (a3), zero
+; RV32V-NEXT:    vand.vv v15, v13, v15
+; RV32V-NEXT:    vmul.vv v15, v5, v15
+; RV32V-NEXT:    vxor.vv v14, v14, v15
+; RV32V-NEXT:    vand.vv v15, v13, v17
+; RV32V-NEXT:    vmul.vv v15, v5, v15
+; RV32V-NEXT:    vxor.vv v14, v14, v15
+; RV32V-NEXT:    vand.vv v15, v13, v20
+; RV32V-NEXT:    vand.vv v17, v13, v25
+; RV32V-NEXT:    vmul.vv v15, v5, v15
+; RV32V-NEXT:    vmul.vv v17, v5, v17
+; RV32V-NEXT:    vxor.vv v14, v14, v15
+; RV32V-NEXT:    vand.vx v15, v19, a1
+; RV32V-NEXT:    vxor.vv v14, v14, v17
+; RV32V-NEXT:    vsrl.vi v17, v14, 24
+; RV32V-NEXT:    vand.vx v17, v17, a1
+; RV32V-NEXT:    vand.vv v20, v13, v21
+; RV32V-NEXT:    vmul.vv v20, v5, v20
+; RV32V-NEXT:    vxor.vv v14, v14, v20
+; RV32V-NEXT:    vand.vv v20, v13, v24
+; RV32V-NEXT:    vmul.vv v20, v5, v20
+; RV32V-NEXT:    vxor.vv v14, v14, v20
+; RV32V-NEXT:    vand.vv v20, v13, v26
+; RV32V-NEXT:    vmul.vv v20, v5, v20
+; RV32V-NEXT:    vxor.vv v14, v14, v20
+; RV32V-NEXT:    vand.vv v20, v13, v2
+; RV32V-NEXT:    vmul.vv v20, v5, v20
+; RV32V-NEXT:    vxor.vv v14, v14, v20
+; RV32V-NEXT:    vand.vv v20, v13, v6
+; RV32V-NEXT:    vmul.vv v20, v5, v20
+; RV32V-NEXT:    vxor.vv v14, v14, v20
+; RV32V-NEXT:    vand.vv v20, v13, v7
+; RV32V-NEXT:    vmul.vv v20, v5, v20
+; RV32V-NEXT:    vxor.vv v14, v14, v20
+; RV32V-NEXT:    vand.vv v20, v13, v31
+; RV32V-NEXT:    vmul.vv v20, v5, v20
+; RV32V-NEXT:    vxor.vv v14, v14, v20
+; RV32V-NEXT:    vand.vv v20, v13, v30
+; RV32V-NEXT:    vmul.vv v20, v5, v20
+; RV32V-NEXT:    vand.vx v21, v3, a0
+; RV32V-NEXT:    vsll.vx v21, v21, a2
+; RV32V-NEXT:    vxor.vv v14, v14, v20
+; RV32V-NEXT:    vsrl.vx v20, v14, a2
+; RV32V-NEXT:    vand.vx v20, v20, a0
+; RV32V-NEXT:    vand.vv v24, v13, v29
+; RV32V-NEXT:    vand.vv v25, v13, v28
+; RV32V-NEXT:    vand.vv v26, v13, v27
+; RV32V-NEXT:    vand.vv v22, v13, v22
+; RV32V-NEXT:    vand.vv v27, v13, v1
+; RV32V-NEXT:    vand.vv v18, v13, v18
+; RV32V-NEXT:    vand.vv v23, v13, v23
+; RV32V-NEXT:    vand.vv v13, v13, v16
+; RV32V-NEXT:    vmul.vv v16, v5, v24
+; RV32V-NEXT:    vmul.vv v24, v5, v25
+; RV32V-NEXT:    vmul.vv v25, v5, v26
+; RV32V-NEXT:    vmul.vv v22, v5, v22
+; RV32V-NEXT:    vmul.vv v26, v5, v27
+; RV32V-NEXT:    vmul.vv v18, v5, v18
+; RV32V-NEXT:    vmul.vv v23, v5, v23
+; RV32V-NEXT:    vmul.vv v13, v5, v13
+; RV32V-NEXT:    vxor.vv v16, v14, v16
+; RV32V-NEXT:    vxor.vv v16, v16, v24
+; RV32V-NEXT:    vxor.vv v16, v16, v25
+; RV32V-NEXT:    vxor.vv v16, v16, v22
+; RV32V-NEXT:    vxor.vv v16, v16, v26
+; RV32V-NEXT:    vxor.vv v16, v16, v18
+; RV32V-NEXT:    vxor.vv v16, v16, v23
+; RV32V-NEXT:    vxor.vv v13, v16, v13
+; RV32V-NEXT:    vsll.vx v16, v4, ra
+; RV32V-NEXT:    vsrl.vx v13, v13, ra
+; RV32V-NEXT:    vor.vv v16, v16, v21
+; RV32V-NEXT:    vsrl.vi v18, v19, 8
+; RV32V-NEXT:    vsll.vi v15, v15, 24
+; RV32V-NEXT:    vand.vv v18, v18, v12
+; RV32V-NEXT:    vor.vv v17, v18, v17
+; RV32V-NEXT:    vand.vv v12, v14, v12
+; RV32V-NEXT:    vsll.vi v12, v12, 8
+; RV32V-NEXT:    vor.vv v12, v15, v12
+; RV32V-NEXT:    vor.vv v12, v16, v12
+; RV32V-NEXT:    vor.vv v13, v20, v13
+; RV32V-NEXT:    vor.vv v13, v17, v13
+; RV32V-NEXT:    vor.vv v12, v12, v13
+; RV32V-NEXT:    vsrl.vi v13, v12, 4
+; RV32V-NEXT:    vand.vv v12, v12, v9
+; RV32V-NEXT:    vand.vv v9, v13, v9
+; RV32V-NEXT:    vsll.vi v12, v12, 4
+; RV32V-NEXT:    vor.vv v9, v9, v12
+; RV32V-NEXT:    vsrl.vi v12, v9, 2
 ; RV32V-NEXT:    vand.vv v9, v9, v10
-; RV32V-NEXT:    vadd.vv v8, v8, v8
-; RV32V-NEXT:    vor.vv v9, v9, v8
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 1
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add a0, sp, a0
-; RV32V-NEXT:    addi a0, a0, 304
-; RV32V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; RV32V-NEXT:    vand.vv v10, v12, v10
+; RV32V-NEXT:    vsll.vi v9, v9, 2
+; RV32V-NEXT:    vor.vv v9, v10, v9
+; RV32V-NEXT:    vsrl.vi v10, v9, 1
+; RV32V-NEXT:    vand.vv v9, v9, v11
+; RV32V-NEXT:    vand.vv v10, v10, v11
+; RV32V-NEXT:    vadd.vv v9, v9, v9
+; RV32V-NEXT:    vor.vv v9, v10, v9
 ; RV32V-NEXT:    vsrl.vi v8, v9, 1, v0.t
-; RV32V-NEXT:    csrr a0, vlenb
-; RV32V-NEXT:    mv a1, a0
-; RV32V-NEXT:    slli a0, a0, 3
-; RV32V-NEXT:    add a1, a1, a0
-; RV32V-NEXT:    slli a0, a0, 2
-; RV32V-NEXT:    add a0, a0, a1
-; RV32V-NEXT:    add sp, sp, a0
-; RV32V-NEXT:    .cfi_def_cfa sp, 368
-; RV32V-NEXT:    lw ra, 364(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s0, 360(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s1, 356(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s2, 352(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s3, 348(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s4, 344(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s5, 340(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s6, 336(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s7, 332(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s8, 328(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s9, 324(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s10, 320(sp) # 4-byte Folded Reload
-; RV32V-NEXT:    lw s11, 316(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw ra, 332(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s0, 328(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s1, 324(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s2, 320(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s3, 316(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s4, 312(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s5, 308(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s6, 304(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s7, 300(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s8, 296(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s9, 292(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s10, 288(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s11, 284(sp) # 4-byte Folded Reload
 ; RV32V-NEXT:    .cfi_restore ra
 ; RV32V-NEXT:    .cfi_restore s0
 ; RV32V-NEXT:    .cfi_restore s1
@@ -55613,7 +45088,7 @@ define <vscale x 1 x i64> @clmulh_nxv1i64_vx_mask(<vscale x 1 x i64> %va, i64 %b
 ; RV32V-NEXT:    .cfi_restore s9
 ; RV32V-NEXT:    .cfi_restore s10
 ; RV32V-NEXT:    .cfi_restore s11
-; RV32V-NEXT:    addi sp, sp, 368
+; RV32V-NEXT:    addi sp, sp, 336
 ; RV32V-NEXT:    .cfi_def_cfa_offset 0
 ; RV32V-NEXT:    ret
 ;
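(Without Zvbc there is no carry-less multiply instruction, so the RV32V/RV64V bodies above expand it bit by bit: each vand.vi/vand.vx isolates one bit of one operand, the vmul against that one-hot value acts as a shift, and the long vxor chain folds the 64 partial products together. The nibble/2-bit/1-bit swap cascades built from the 61681/209715/349525 LUI constants (the 0x0F0F0F0F, 0x33333333, 0x55555555 masks) bit-reverse inputs and result, using the identity clmulh(a, b) == bitrev(clmul(bitrev(a), bitrev(b))) >> 1, hence the trailing vsrl.vi by 1. A compact scalar sketch of the core expansion, under the same assumptions as the sketch above:

    #include <cstdint>

    // Carry-less multiply, low 64 bits: XOR-accumulate shifted copies of a,
    // one per set bit of b -- the same partial-product structure the vector
    // expansion spells out with vand/vmul/vxor.
    std::uint64_t clmul64(std::uint64_t a, std::uint64_t b) {
      std::uint64_t r = 0;
      for (int i = 0; i < 64; ++i)
        if ((b >> i) & 1)
          r ^= a << i;
      return r;
    }
)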
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-clmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-clmul.ll
index 0c9e96e2a1694..d8a597b5413fc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-clmul.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-clmul.ll
@@ -716,139 +716,139 @@ define <1 x i64> @clmul_v1i64(<1 x i64> %x, <1 x i64> %y) nounwind {
 ; RV32-NEXT:    lui a6, 8192
 ; RV32-NEXT:    lui a5, 16384
 ; RV32-NEXT:    lui a3, 32768
-; RV32-NEXT:    sw a1, 272(sp)
-; RV32-NEXT:    sw zero, 276(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    sw zero, 20(sp)
+; RV32-NEXT:    sw zero, 272(sp)
+; RV32-NEXT:    sw t5, 276(sp)
 ; RV32-NEXT:    sw zero, 264(sp)
-; RV32-NEXT:    sw t5, 268(sp)
-; RV32-NEXT:    sw zero, 256(sp)
-; RV32-NEXT:    sw a4, 260(sp)
+; RV32-NEXT:    sw a4, 268(sp)
 ; RV32-NEXT:    lui a4, 65536
-; RV32-NEXT:    sw zero, 248(sp)
-; RV32-NEXT:    sw a2, 252(sp)
+; RV32-NEXT:    sw zero, 256(sp)
+; RV32-NEXT:    sw a2, 260(sp)
 ; RV32-NEXT:    lui a2, 131072
-; RV32-NEXT:    sw zero, 240(sp)
-; RV32-NEXT:    sw s11, 244(sp)
+; RV32-NEXT:    sw zero, 248(sp)
+; RV32-NEXT:    sw s11, 252(sp)
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vi v13, v9, 2
 ; RV32-NEXT:    vand.vi v14, v9, 1
 ; RV32-NEXT:    vand.vi v12, v9, 4
 ; RV32-NEXT:    vand.vi v11, v9, 8
-; RV32-NEXT:    sw zero, 232(sp)
-; RV32-NEXT:    sw a0, 236(sp)
+; RV32-NEXT:    sw zero, 240(sp)
+; RV32-NEXT:    sw a0, 244(sp)
 ; RV32-NEXT:    vand.vx v10, v9, a0
-; RV32-NEXT:    addi s11, sp, 272
-; RV32-NEXT:    sw zero, 224(sp)
-; RV32-NEXT:    sw ra, 228(sp)
+; RV32-NEXT:    addi s11, sp, 16
+; RV32-NEXT:    sw zero, 232(sp)
+; RV32-NEXT:    sw ra, 236(sp)
 ; RV32-NEXT:    vand.vx v15, v9, ra
-; RV32-NEXT:    addi ra, sp, 264
-; RV32-NEXT:    sw zero, 216(sp)
-; RV32-NEXT:    sw s10, 220(sp)
+; RV32-NEXT:    addi ra, sp, 272
+; RV32-NEXT:    sw zero, 224(sp)
+; RV32-NEXT:    sw s10, 228(sp)
 ; RV32-NEXT:    vand.vx v16, v9, s10
-; RV32-NEXT:    addi s10, sp, 256
-; RV32-NEXT:    sw zero, 208(sp)
-; RV32-NEXT:    sw s9, 212(sp)
+; RV32-NEXT:    addi s10, sp, 264
+; RV32-NEXT:    sw zero, 216(sp)
+; RV32-NEXT:    sw s9, 220(sp)
 ; RV32-NEXT:    vand.vx v17, v9, s9
-; RV32-NEXT:    addi s9, sp, 248
-; RV32-NEXT:    sw zero, 200(sp)
-; RV32-NEXT:    sw s8, 204(sp)
+; RV32-NEXT:    addi s9, sp, 256
+; RV32-NEXT:    sw zero, 208(sp)
+; RV32-NEXT:    sw s8, 212(sp)
 ; RV32-NEXT:    vand.vx v18, v9, s8
-; RV32-NEXT:    addi s8, sp, 240
-; RV32-NEXT:    sw zero, 192(sp)
-; RV32-NEXT:    sw s7, 196(sp)
+; RV32-NEXT:    addi s8, sp, 248
+; RV32-NEXT:    sw zero, 200(sp)
+; RV32-NEXT:    sw s7, 204(sp)
 ; RV32-NEXT:    vand.vx v19, v9, s7
-; RV32-NEXT:    addi s7, sp, 232
-; RV32-NEXT:    sw zero, 184(sp)
-; RV32-NEXT:    sw s1, 188(sp)
+; RV32-NEXT:    addi s7, sp, 240
+; RV32-NEXT:    sw zero, 192(sp)
+; RV32-NEXT:    sw s1, 196(sp)
 ; RV32-NEXT:    vand.vx v20, v9, s1
 ; RV32-NEXT:    slli t5, t5, 11
 ; RV32-NEXT:    vand.vx v21, v9, s6
+; RV32-NEXT:    sw zero, 184(sp)
+; RV32-NEXT:    sw t5, 188(sp)
 ; RV32-NEXT:    sw zero, 176(sp)
-; RV32-NEXT:    sw t5, 180(sp)
-; RV32-NEXT:    sw zero, 168(sp)
-; RV32-NEXT:    sw s6, 172(sp)
-; RV32-NEXT:    addi s6, sp, 216
+; RV32-NEXT:    sw s6, 180(sp)
+; RV32-NEXT:    addi s6, sp, 224
 ; RV32-NEXT:    vand.vx v22, v9, s5
-; RV32-NEXT:    sw zero, 160(sp)
-; RV32-NEXT:    sw s5, 164(sp)
-; RV32-NEXT:    addi s5, sp, 208
+; RV32-NEXT:    sw zero, 168(sp)
+; RV32-NEXT:    sw s5, 172(sp)
+; RV32-NEXT:    addi s5, sp, 216
 ; RV32-NEXT:    vand.vx v23, v9, s4
-; RV32-NEXT:    sw zero, 152(sp)
-; RV32-NEXT:    sw s4, 156(sp)
-; RV32-NEXT:    addi s4, sp, 200
+; RV32-NEXT:    sw zero, 160(sp)
+; RV32-NEXT:    sw s4, 164(sp)
+; RV32-NEXT:    addi s4, sp, 208
 ; RV32-NEXT:    vand.vx v24, v9, s3
-; RV32-NEXT:    sw zero, 144(sp)
-; RV32-NEXT:    sw s3, 148(sp)
-; RV32-NEXT:    addi s3, sp, 192
+; RV32-NEXT:    sw zero, 152(sp)
+; RV32-NEXT:    sw s3, 156(sp)
+; RV32-NEXT:    addi s3, sp, 200
 ; RV32-NEXT:    vand.vx v25, v9, s2
-; RV32-NEXT:    sw zero, 136(sp)
-; RV32-NEXT:    sw s2, 140(sp)
-; RV32-NEXT:    addi s2, sp, 184
+; RV32-NEXT:    sw zero, 144(sp)
+; RV32-NEXT:    sw s2, 148(sp)
+; RV32-NEXT:    addi s2, sp, 192
 ; RV32-NEXT:    vand.vx v26, v9, s0
-; RV32-NEXT:    sw zero, 128(sp)
-; RV32-NEXT:    sw s0, 132(sp)
-; RV32-NEXT:    addi s1, sp, 176
+; RV32-NEXT:    sw zero, 136(sp)
+; RV32-NEXT:    sw s0, 140(sp)
+; RV32-NEXT:    addi s1, sp, 184
 ; RV32-NEXT:    vand.vx v27, v9, t6
-; RV32-NEXT:    sw zero, 120(sp)
-; RV32-NEXT:    sw t6, 124(sp)
-; RV32-NEXT:    addi s0, sp, 168
+; RV32-NEXT:    sw zero, 128(sp)
+; RV32-NEXT:    sw t6, 132(sp)
+; RV32-NEXT:    addi s0, sp, 176
 ; RV32-NEXT:    vand.vx v28, v9, t4
-; RV32-NEXT:    sw zero, 112(sp)
-; RV32-NEXT:    sw t4, 116(sp)
-; RV32-NEXT:    addi t6, sp, 160
+; RV32-NEXT:    sw zero, 120(sp)
+; RV32-NEXT:    sw t4, 124(sp)
+; RV32-NEXT:    addi t6, sp, 168
 ; RV32-NEXT:    vand.vx v29, v9, t3
-; RV32-NEXT:    sw zero, 104(sp)
-; RV32-NEXT:    sw t3, 108(sp)
-; RV32-NEXT:    addi t4, sp, 152
+; RV32-NEXT:    sw zero, 112(sp)
+; RV32-NEXT:    sw t3, 116(sp)
+; RV32-NEXT:    addi t4, sp, 160
 ; RV32-NEXT:    vand.vx v30, v9, t2
-; RV32-NEXT:    sw zero, 96(sp)
-; RV32-NEXT:    sw t2, 100(sp)
-; RV32-NEXT:    addi t3, sp, 144
+; RV32-NEXT:    sw zero, 104(sp)
+; RV32-NEXT:    sw t2, 108(sp)
+; RV32-NEXT:    addi t3, sp, 152
 ; RV32-NEXT:    vand.vx v31, v9, t1
-; RV32-NEXT:    sw zero, 88(sp)
-; RV32-NEXT:    sw t1, 92(sp)
-; RV32-NEXT:    addi t2, sp, 136
+; RV32-NEXT:    sw zero, 96(sp)
+; RV32-NEXT:    sw t1, 100(sp)
+; RV32-NEXT:    addi t2, sp, 144
 ; RV32-NEXT:    vand.vx v7, v9, t0
-; RV32-NEXT:    sw zero, 80(sp)
-; RV32-NEXT:    sw t0, 84(sp)
-; RV32-NEXT:    addi t1, sp, 128
+; RV32-NEXT:    sw zero, 88(sp)
+; RV32-NEXT:    sw t0, 92(sp)
+; RV32-NEXT:    addi t1, sp, 136
 ; RV32-NEXT:    vand.vx v6, v9, a7
-; RV32-NEXT:    sw zero, 72(sp)
-; RV32-NEXT:    sw a7, 76(sp)
-; RV32-NEXT:    addi t0, sp, 120
+; RV32-NEXT:    sw zero, 80(sp)
+; RV32-NEXT:    sw a7, 84(sp)
+; RV32-NEXT:    addi t0, sp, 128
 ; RV32-NEXT:    vand.vx v5, v9, a6
-; RV32-NEXT:    sw zero, 64(sp)
-; RV32-NEXT:    sw a6, 68(sp)
-; RV32-NEXT:    addi a7, sp, 112
+; RV32-NEXT:    sw zero, 72(sp)
+; RV32-NEXT:    sw a6, 76(sp)
+; RV32-NEXT:    addi a7, sp, 120
 ; RV32-NEXT:    vand.vx v4, v9, a5
-; RV32-NEXT:    sw zero, 56(sp)
-; RV32-NEXT:    sw a5, 60(sp)
-; RV32-NEXT:    addi a6, sp, 104
+; RV32-NEXT:    sw zero, 64(sp)
+; RV32-NEXT:    sw a5, 68(sp)
+; RV32-NEXT:    addi a6, sp, 112
 ; RV32-NEXT:    vand.vx v3, v9, a3
-; RV32-NEXT:    sw zero, 48(sp)
-; RV32-NEXT:    sw a3, 52(sp)
-; RV32-NEXT:    addi a5, sp, 96
+; RV32-NEXT:    sw zero, 56(sp)
+; RV32-NEXT:    sw a3, 60(sp)
+; RV32-NEXT:    addi a5, sp, 104
 ; RV32-NEXT:    vand.vx v2, v9, a4
-; RV32-NEXT:    sw zero, 40(sp)
-; RV32-NEXT:    sw a4, 44(sp)
-; RV32-NEXT:    addi a4, sp, 88
+; RV32-NEXT:    sw zero, 48(sp)
+; RV32-NEXT:    sw a4, 52(sp)
+; RV32-NEXT:    addi a4, sp, 96
 ; RV32-NEXT:    vand.vx v1, v9, a2
+; RV32-NEXT:    sw zero, 40(sp)
+; RV32-NEXT:    sw a2, 44(sp)
+; RV32-NEXT:    addi a3, sp, 88
 ; RV32-NEXT:    sw zero, 32(sp)
-; RV32-NEXT:    sw a2, 36(sp)
-; RV32-NEXT:    addi a3, sp, 80
-; RV32-NEXT:    sw zero, 24(sp)
 ; RV32-NEXT:    lui a0, 262144
-; RV32-NEXT:    sw a0, 28(sp)
-; RV32-NEXT:    sw zero, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    addi a2, sp, 72
+; RV32-NEXT:    sw a0, 36(sp)
+; RV32-NEXT:    sw zero, 24(sp)
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    addi a2, sp, 80
 ; RV32-NEXT:    vand.vx v0, v9, t5
-; RV32-NEXT:    addi a1, sp, 64
+; RV32-NEXT:    addi a1, sp, 72
 ; RV32-NEXT:    vmul.vv v13, v8, v13
 ; RV32-NEXT:    vmul.vv v14, v8, v14
 ; RV32-NEXT:    vxor.vi v14, v14, 0
 ; RV32-NEXT:    vxor.vv v14, v14, v13
 ; RV32-NEXT:    vlse64.v v13, (s11), zero
-; RV32-NEXT:    addi s11, sp, 56
+; RV32-NEXT:    addi s11, sp, 64
 ; RV32-NEXT:    vmul.vv v12, v8, v12
 ; RV32-NEXT:    vxor.vv v14, v14, v12
 ; RV32-NEXT:    vlse64.v v12, (ra), zero
@@ -860,7 +860,7 @@ define <1 x i64> @clmul_v1i64(<1 x i64> %x, <1 x i64> %y) nounwind {
 ; RV32-NEXT:    add t5, sp, t5
 ; RV32-NEXT:    addi t5, t5, 288
 ; RV32-NEXT:    vs1r.v v12, (t5) # vscale x 8-byte Folded Spill
-; RV32-NEXT:    addi ra, sp, 48
+; RV32-NEXT:    addi ra, sp, 56
 ; RV32-NEXT:    vmul.vv v11, v8, v11
 ; RV32-NEXT:    vxor.vv v14, v14, v11
 ; RV32-NEXT:    vlse64.v v11, (s10), zero
@@ -870,7 +870,7 @@ define <1 x i64> @clmul_v1i64(<1 x i64> %x, <1 x i64> %y) nounwind {
 ; RV32-NEXT:    add t5, sp, t5
 ; RV32-NEXT:    addi t5, t5, 288
 ; RV32-NEXT:    vs1r.v v11, (t5) # vscale x 8-byte Folded Spill
-; RV32-NEXT:    addi s10, sp, 40
+; RV32-NEXT:    addi s10, sp, 48
 ; RV32-NEXT:    vmul.vv v10, v8, v10
 ; RV32-NEXT:    vxor.vv v14, v14, v10
 ; RV32-NEXT:    vlse64.v v10, (s9), zero
@@ -879,7 +879,7 @@ define <1 x i64> @clmul_v1i64(<1 x i64> %x, <1 x i64> %y) nounwind {
 ; RV32-NEXT:    add t5, sp, t5
 ; RV32-NEXT:    addi t5, t5, 288
 ; RV32-NEXT:    vs1r.v v10, (t5) # vscale x 8-byte Folded Spill
-; RV32-NEXT:    addi t5, sp, 32
+; RV32-NEXT:    addi t5, sp, 40
 ; RV32-NEXT:    vmul.vv v15, v8, v15
 ; RV32-NEXT:    vxor.vv v15, v14, v15
 ; RV32-NEXT:    vlse64.v v10, (s8), zero
@@ -889,7 +889,7 @@ define <1 x i64> @clmul_v1i64(<1 x i64> %x, <1 x i64> %y) nounwind {
 ; RV32-NEXT:    add s8, sp, s8
 ; RV32-NEXT:    addi s8, s8, 288
 ; RV32-NEXT:    vs1r.v v10, (s8) # vscale x 8-byte Folded Spill
-; RV32-NEXT:    addi s8, sp, 24
+; RV32-NEXT:    addi s8, sp, 32
 ; RV32-NEXT:    vmul.vv v16, v8, v16
 ; RV32-NEXT:    vxor.vv v16, v15, v16
 ; RV32-NEXT:    vlse64.v v10, (s7), zero
@@ -898,7 +898,7 @@ define <1 x i64> @clmul_v1i64(<1 x i64> %x, <1 x i64> %y) nounwind {
 ; RV32-NEXT:    add s7, sp, s7
 ; RV32-NEXT:    addi s7, s7, 288
 ; RV32-NEXT:    vs1r.v v10, (s7) # vscale x 8-byte Folded Spill
-; RV32-NEXT:    addi s7, sp, 16
+; RV32-NEXT:    addi s7, sp, 24
 ; RV32-NEXT:    vmul.vv v17, v8, v17
 ; RV32-NEXT:    vmul.vv v18, v8, v18
 ; RV32-NEXT:    vmul.vv v19, v8, v19
@@ -923,7 +923,7 @@ define <1 x i64> @clmul_v1i64(<1 x i64> %x, <1 x i64> %y) nounwind {
 ; RV32-NEXT:    vmul.vv v1, v8, v1
 ; RV32-NEXT:    vmul.vv v0, v8, v0
 ; RV32-NEXT:    vxor.vv v16, v16, v17
-; RV32-NEXT:    addi s9, sp, 224
+; RV32-NEXT:    addi s9, sp, 232
 ; RV32-NEXT:    vlse64.v v11, (s9), zero
 ; RV32-NEXT:    vxor.vv v16, v16, v18
 ; RV32-NEXT:    vlse64.v v10, (s6), zero
@@ -1507,139 +1507,139 @@ define <2 x i64> @clmul_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; RV32-NEXT:    lui a6, 8192
 ; RV32-NEXT:    lui a5, 16384
 ; RV32-NEXT:    lui a3, 32768
-; RV32-NEXT:    sw a1, 272(sp)
-; RV32-NEXT:    sw zero, 276(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    sw zero, 20(sp)
+; RV32-NEXT:    sw zero, 272(sp)
+; RV32-NEXT:    sw t5, 276(sp)
 ; RV32-NEXT:    sw zero, 264(sp)
-; RV32-NEXT:    sw t5, 268(sp)
-; RV32-NEXT:    sw zero, 256(sp)
-; RV32-NEXT:    sw a4, 260(sp)
+; RV32-NEXT:    sw a4, 268(sp)
 ; RV32-NEXT:    lui a4, 65536
-; RV32-NEXT:    sw zero, 248(sp)
-; RV32-NEXT:    sw a2, 252(sp)
+; RV32-NEXT:    sw zero, 256(sp)
+; RV32-NEXT:    sw a2, 260(sp)
 ; RV32-NEXT:    lui a2, 131072
-; RV32-NEXT:    sw zero, 240(sp)
-; RV32-NEXT:    sw s11, 244(sp)
+; RV32-NEXT:    sw zero, 248(sp)
+; RV32-NEXT:    sw s11, 252(sp)
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vi v13, v9, 2
 ; RV32-NEXT:    vand.vi v14, v9, 1
 ; RV32-NEXT:    vand.vi v12, v9, 4
 ; RV32-NEXT:    vand.vi v11, v9, 8
-; RV32-NEXT:    sw zero, 232(sp)
-; RV32-NEXT:    sw a0, 236(sp)
+; RV32-NEXT:    sw zero, 240(sp)
+; RV32-NEXT:    sw a0, 244(sp)
 ; RV32-NEXT:    vand.vx v10, v9, a0
-; RV32-NEXT:    addi s11, sp, 272
-; RV32-NEXT:    sw zero, 224(sp)
-; RV32-NEXT:    sw ra, 228(sp)
+; RV32-NEXT:    addi s11, sp, 16
+; RV32-NEXT:    sw zero, 232(sp)
+; RV32-NEXT:    sw ra, 236(sp)
 ; RV32-NEXT:    vand.vx v15, v9, ra
-; RV32-NEXT:    addi ra, sp, 264
-; RV32-NEXT:    sw zero, 216(sp)
-; RV32-NEXT:    sw s10, 220(sp)
+; RV32-NEXT:    addi ra, sp, 272
+; RV32-NEXT:    sw zero, 224(sp)
+; RV32-NEXT:    sw s10, 228(sp)
 ; RV32-NEXT:    vand.vx v16, v9, s10
-; RV32-NEXT:    addi s10, sp, 256
-; RV32-NEXT:    sw zero, 208(sp)
-; RV32-NEXT:    sw s9, 212(sp)
+; RV32-NEXT:    addi s10, sp, 264
+; RV32-NEXT:    sw zero, 216(sp)
+; RV32-NEXT:    sw s9, 220(sp)
 ; RV32-NEXT:    vand.vx v17, v9, s9
-; RV32-NEXT:    addi s9, sp, 248
-; RV32-NEXT:    sw zero, 200(sp)
-; RV32-NEXT:    sw s8, 204(sp)
+; RV32-NEXT:    addi s9, sp, 256
+; RV32-NEXT:    sw zero, 208(sp)
+; RV32-NEXT:    sw s8, 212(sp)
 ; RV32-NEXT:    vand.vx v18, v9, s8
-; RV32-NEXT:    addi s8, sp, 240
-; RV32-NEXT:    sw zero, 192(sp)
-; RV32-NEXT:    sw s7, 196(sp)
+; RV32-NEXT:    addi s8, sp, 248
+; RV32-NEXT:    sw zero, 200(sp)
+; RV32-NEXT:    sw s7, 204(sp)
 ; RV32-NEXT:    vand.vx v19, v9, s7
-; RV32-NEXT:    addi s7, sp, 232
-; RV32-NEXT:    sw zero, 184(sp)
-; RV32-NEXT:    sw s1, 188(sp)
+; RV32-NEXT:    addi s7, sp, 240
+; RV32-NEXT:    sw zero, 192(sp)
+; RV32-NEXT:    sw s1, 196(sp)
 ; RV32-NEXT:    vand.vx v20, v9, s1
 ; RV32-NEXT:    slli t5, t5, 11
 ; RV32-NEXT:    vand.vx v21, v9, s6
+; RV32-NEXT:    sw zero, 184(sp)
+; RV32-NEXT:    sw t5, 188(sp)
 ; RV32-NEXT:    sw zero, 176(sp)
-; RV32-NEXT:    sw t5, 180(sp)
-; RV32-NEXT:    sw zero, 168(sp)
-; RV32-NEXT:    sw s6, 172(sp)
-; RV32-NEXT:    addi s6, sp, 216
+; RV32-NEXT:    sw s6, 180(sp)
+; RV32-NEXT:    addi s6, sp, 224
 ; RV32-NEXT:    vand.vx v22, v9, s5
-; RV32-NEXT:    sw zero, 160(sp)
-; RV32-NEXT:    sw s5, 164(sp)
-; RV32-NEXT:    addi s5, sp, 208
+; RV32-NEXT:    sw zero, 168(sp)
+; RV32-NEXT:    sw s5, 172(sp)
+; RV32-NEXT:    addi s5, sp, 216
 ; RV32-NEXT:    vand.vx v23, v9, s4
-; RV32-NEXT:    sw zero, 152(sp)
-; RV32-NEXT:    sw s4, 156(sp)
-; RV32-NEXT:    addi s4, sp, 200
+; RV32-NEXT:    sw zero, 160(sp)
+; RV32-NEXT:    sw s4, 164(sp)
+; RV32-NEXT:    addi s4, sp, 208
 ; RV32-NEXT:    vand.vx v24, v9, s3
-; RV32-NEXT:    sw zero, 144(sp)
-; RV32-NEXT:    sw s3, 148(sp)
-; RV32-NEXT:    addi s3, sp, 192
+; RV32-NEXT:    sw zero, 152(sp)
+; RV32-NEXT:    sw s3, 156(sp)
+; RV32-NEXT:    addi s3, sp, 200
 ; RV32-NEXT:    vand.vx v25, v9, s2
-; RV32-NEXT:    sw zero, 136(sp)
-; RV32-NEXT:    sw s2, 140(sp)
-; RV32-NEXT:    addi s2, sp, 184
+; RV32-NEXT:    sw zero, 144(sp)
+; RV32-NEXT:    sw s2, 148(sp)
+; RV32-NEXT:    addi s2, sp, 192
 ; RV32-NEXT:    vand.vx v26, v9, s0
-; RV32-NEXT:    sw zero, 128(sp)
-; RV32-NEXT:    sw s0, 132(sp)
-; RV32-NEXT:    addi s1, sp, 176
+; RV32-NEXT:    sw zero, 136(sp)
+; RV32-NEXT:    sw s0, 140(sp)
+; RV32-NEXT:    addi s1, sp, 184
 ; RV32-NEXT:    vand.vx v27, v9, t6
-; RV32-NEXT:    sw zero, 120(sp)
-; RV32-NEXT:    sw t6, 124(sp)
-; RV32-NEXT:    addi s0, sp, 168
+; RV32-NEXT:    sw zero, 128(sp)
+; RV32-NEXT:    sw t6, 132(sp)
+; RV32-NEXT:    addi s0, sp, 176
 ; RV32-NEXT:    vand.vx v28, v9, t4
-; RV32-NEXT:    sw zero, 112(sp)
-; RV32-NEXT:    sw t4, 116(sp)
-; RV32-NEXT:    addi t6, sp, 160
+; RV32-NEXT:    sw zero, 120(sp)
+; RV32-NEXT:    sw t4, 124(sp)
+; RV32-NEXT:    addi t6, sp, 168
 ; RV32-NEXT:    vand.vx v29, v9, t3
-; RV32-NEXT:    sw zero, 104(sp)
-; RV32-NEXT:    sw t3, 108(sp)
-; RV32-NEXT:    addi t4, sp, 152
+; RV32-NEXT:    sw zero, 112(sp)
+; RV32-NEXT:    sw t3, 116(sp)
+; RV32-NEXT:    addi t4, sp, 160
 ; RV32-NEXT:    vand.vx v30, v9, t2
-; RV32-NEXT:    sw zero, 96(sp)
-; RV32-NEXT:    sw t2, 100(sp)
-; RV32-NEXT:    addi t3, sp, 144
+; RV32-NEXT:    sw zero, 104(sp)
+; RV32-NEXT:    sw t2, 108(sp)
+; RV32-NEXT:    addi t3, sp, 152
 ; RV32-NEXT:    vand.vx v31, v9, t1
-; RV32-NEXT:    sw zero, 88(sp)
-; RV32-NEXT:    sw t1, 92(sp)
-; RV32-NEXT:    addi t2, sp, 136
+; RV32-NEXT:    sw zero, 96(sp)
+; RV32-NEXT:    sw t1, 100(sp)
+; RV32-NEXT:    addi t2, sp, 144
 ; RV32-NEXT:    vand.vx v7, v9, t0
-; RV32-NEXT:    sw zero, 80(sp)
-; RV32-NEXT:    sw t0, 84(sp)
-; RV32-NEXT:    addi t1, sp, 128
+; RV32-NEXT:    sw zero, 88(sp)
+; RV32-NEXT:    sw t0, 92(sp)
+; RV32-NEXT:    addi t1, sp, 136
 ; RV32-NEXT:    vand.vx v6, v9, a7
-; RV32-NEXT:    sw zero, 72(sp)
-; RV32-NEXT:    sw a7, 76(sp)
-; RV32-NEXT:    addi t0, sp, 120
+; RV32-NEXT:    sw zero, 80(sp)
+; RV32-NEXT:    sw a7, 84(sp)
+; RV32-NEXT:    addi t0, sp, 128
 ; RV32-NEXT:    vand.vx v5, v9, a6
-; RV32-NEXT:    sw zero, 64(sp)
-; RV32-NEXT:    sw a6, 68(sp)
-; RV32-NEXT:    addi a7, sp, 112
+; RV32-NEXT:    sw zero, 72(sp)
+; RV32-NEXT:    sw a6, 76(sp)
+; RV32-NEXT:    addi a7, sp, 120
 ; RV32-NEXT:    vand.vx v4, v9, a5
-; RV32-NEXT:    sw zero, 56(sp)
-; RV32-NEXT:    sw a5, 60(sp)
-; RV32-NEXT:    addi a6, sp, 104
+; RV32-NEXT:    sw zero, 64(sp)
+; RV32-NEXT:    sw a5, 68(sp)
+; RV32-NEXT:    addi a6, sp, 112
 ; RV32-NEXT:    vand.vx v3, v9, a3
-; RV32-NEXT:    sw zero, 48(sp)
-; RV32-NEXT:    sw a3, 52(sp)
-; RV32-NEXT:    addi a5, sp, 96
+; RV32-NEXT:    sw zero, 56(sp)
+; RV32-NEXT:    sw a3, 60(sp)
+; RV32-NEXT:    addi a5, sp, 104
 ; RV32-NEXT:    vand.vx v2, v9, a4
-; RV32-NEXT:    sw zero, 40(sp)
-; RV32-NEXT:    sw a4, 44(sp)
-; RV32-NEXT:    addi a4, sp, 88
+; RV32-NEXT:    sw zero, 48(sp)
+; RV32-NEXT:    sw a4, 52(sp)
+; RV32-NEXT:    addi a4, sp, 96
 ; RV32-NEXT:    vand.vx v1, v9, a2
+; RV32-NEXT:    sw zero, 40(sp)
+; RV32-NEXT:    sw a2, 44(sp)
+; RV32-NEXT:    addi a3, sp, 88
 ; RV32-NEXT:    sw zero, 32(sp)
-; RV32-NEXT:    sw a2, 36(sp)
-; RV32-NEXT:    addi a3, sp, 80
-; RV32-NEXT:    sw zero, 24(sp)
 ; RV32-NEXT:    lui a0, 262144
-; RV32-NEXT:    sw a0, 28(sp)
-; RV32-NEXT:    sw zero, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    addi a2, sp, 72
+; RV32-NEXT:    sw a0, 36(sp)
+; RV32-NEXT:    sw zero, 24(sp)
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    addi a2, sp, 80
 ; RV32-NEXT:    vand.vx v0, v9, t5
-; RV32-NEXT:    addi a1, sp, 64
+; RV32-NEXT:    addi a1, sp, 72
 ; RV32-NEXT:    vmul.vv v13, v8, v13
 ; RV32-NEXT:    vmul.vv v14, v8, v14
 ; RV32-NEXT:    vxor.vi v14, v14, 0
 ; RV32-NEXT:    vxor.vv v14, v14, v13
 ; RV32-NEXT:    vlse64.v v13, (s11), zero
-; RV32-NEXT:    addi s11, sp, 56
+; RV32-NEXT:    addi s11, sp, 64
 ; RV32-NEXT:    vmul.vv v12, v8, v12
 ; RV32-NEXT:    vxor.vv v14, v14, v12
 ; RV32-NEXT:    vlse64.v v12, (ra), zero
@@ -1651,7 +1651,7 @@ define <2 x i64> @clmul_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; RV32-NEXT:    add t5, sp, t5
 ; RV32-NEXT:    addi t5, t5, 288
 ; RV32-NEXT:    vs1r.v v12, (t5) # vscale x 8-byte Folded Spill
-; RV32-NEXT:    addi ra, sp, 48
+; RV32-NEXT:    addi ra, sp, 56
 ; RV32-NEXT:    vmul.vv v11, v8, v11
 ; RV32-NEXT:    vxor.vv v14, v14, v11
 ; RV32-NEXT:    vlse64.v v11, (s10), zero
@@ -1661,7 +1661,7 @@ define <2 x i64> @clmul_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; RV32-NEXT:    add t5, sp, t5
 ; RV32-NEXT:    addi t5, t5, 288
 ; RV32-NEXT:    vs1r.v v11, (t5) # vscale x 8-byte Folded Spill
-; RV32-NEXT:    addi s10, sp, 40
+; RV32-NEXT:    addi s10, sp, 48
 ; RV32-NEXT:    vmul.vv v10, v8, v10
 ; RV32-NEXT:    vxor.vv v14, v14, v10
 ; RV32-NEXT:    vlse64.v v10, (s9), zero
@@ -1670,7 +1670,7 @@ define <2 x i64> @clmul_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; RV32-NEXT:    add t5, sp, t5
 ; RV32-NEXT:    addi t5, t5, 288
 ; RV32-NEXT:    vs1r.v v10, (t5) # vscale x 8-byte Folded Spill
-; RV32-NEXT:    addi t5, sp, 32
+; RV32-NEXT:    addi t5, sp, 40
 ; RV32-NEXT:    vmul.vv v15, v8, v15
 ; RV32-NEXT:    vxor.vv v15, v14, v15
 ; RV32-NEXT:    vlse64.v v10, (s8), zero
@@ -1680,7 +1680,7 @@ define <2 x i64> @clmul_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; RV32-NEXT:    add s8, sp, s8
 ; RV32-NEXT:    addi s8, s8, 288
 ; RV32-NEXT:    vs1r.v v10, (s8) # vscale x 8-byte Folded Spill
-; RV32-NEXT:    addi s8, sp, 24
+; RV32-NEXT:    addi s8, sp, 32
 ; RV32-NEXT:    vmul.vv v16, v8, v16
 ; RV32-NEXT:    vxor.vv v16, v15, v16
 ; RV32-NEXT:    vlse64.v v10, (s7), zero
@@ -1689,7 +1689,7 @@ define <2 x i64> @clmul_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; RV32-NEXT:    add s7, sp, s7
 ; RV32-NEXT:    addi s7, s7, 288
 ; RV32-NEXT:    vs1r.v v10, (s7) # vscale x 8-byte Folded Spill
-; RV32-NEXT:    addi s7, sp, 16
+; RV32-NEXT:    addi s7, sp, 24
 ; RV32-NEXT:    vmul.vv v17, v8, v17
 ; RV32-NEXT:    vmul.vv v18, v8, v18
 ; RV32-NEXT:    vmul.vv v19, v8, v19
@@ -1714,7 +1714,7 @@ define <2 x i64> @clmul_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; RV32-NEXT:    vmul.vv v1, v8, v1
 ; RV32-NEXT:    vmul.vv v0, v8, v0
 ; RV32-NEXT:    vxor.vv v16, v16, v17
-; RV32-NEXT:    addi s9, sp, 224
+; RV32-NEXT:    addi s9, sp, 232
 ; RV32-NEXT:    vlse64.v v11, (s9), zero
 ; RV32-NEXT:    vxor.vv v16, v16, v18
 ; RV32-NEXT:    vlse64.v v10, (s6), zero
@@ -2302,79 +2302,79 @@ define <4 x i64> @clmul_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; RV32-NEXT:    lui a6, 8192
 ; RV32-NEXT:    lui a5, 16384
 ; RV32-NEXT:    lui a4, 32768
-; RV32-NEXT:    sw a1, 272(sp)
-; RV32-NEXT:    sw zero, 276(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    sw zero, 20(sp)
+; RV32-NEXT:    sw zero, 272(sp)
+; RV32-NEXT:    sw s2, 276(sp)
 ; RV32-NEXT:    sw zero, 264(sp)
-; RV32-NEXT:    sw s2, 268(sp)
-; RV32-NEXT:    sw zero, 256(sp)
-; RV32-NEXT:    sw a3, 260(sp)
+; RV32-NEXT:    sw a3, 268(sp)
 ; RV32-NEXT:    lui a3, 65536
-; RV32-NEXT:    sw zero, 248(sp)
-; RV32-NEXT:    sw a2, 252(sp)
+; RV32-NEXT:    sw zero, 256(sp)
+; RV32-NEXT:    sw a2, 260(sp)
 ; RV32-NEXT:    lui a2, 131072
-; RV32-NEXT:    sw zero, 240(sp)
-; RV32-NEXT:    sw s7, 244(sp)
+; RV32-NEXT:    sw zero, 248(sp)
+; RV32-NEXT:    sw s7, 252(sp)
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vi v28, v10, 2
 ; RV32-NEXT:    vand.vi v20, v10, 1
 ; RV32-NEXT:    vand.vi v30, v10, 4
 ; RV32-NEXT:    vand.vi v14, v10, 8
-; RV32-NEXT:    sw zero, 232(sp)
-; RV32-NEXT:    sw a0, 236(sp)
+; RV32-NEXT:    sw zero, 240(sp)
+; RV32-NEXT:    sw a0, 244(sp)
 ; RV32-NEXT:    vand.vx v12, v10, a0
-; RV32-NEXT:    addi s7, sp, 272
-; RV32-NEXT:    sw zero, 224(sp)
-; RV32-NEXT:    sw s6, 228(sp)
+; RV32-NEXT:    addi s7, sp, 16
+; RV32-NEXT:    sw zero, 232(sp)
+; RV32-NEXT:    sw s6, 236(sp)
 ; RV32-NEXT:    vand.vx v16, v10, s6
-; RV32-NEXT:    addi s6, sp, 264
-; RV32-NEXT:    sw zero, 216(sp)
-; RV32-NEXT:    sw s5, 220(sp)
+; RV32-NEXT:    addi s6, sp, 272
+; RV32-NEXT:    sw zero, 224(sp)
+; RV32-NEXT:    sw s5, 228(sp)
 ; RV32-NEXT:    vand.vx v18, v10, s5
-; RV32-NEXT:    addi s5, sp, 256
-; RV32-NEXT:    sw zero, 208(sp)
-; RV32-NEXT:    sw s4, 212(sp)
+; RV32-NEXT:    addi s5, sp, 264
+; RV32-NEXT:    sw zero, 216(sp)
+; RV32-NEXT:    sw s4, 220(sp)
 ; RV32-NEXT:    vand.vx v0, v10, s4
-; RV32-NEXT:    addi s4, sp, 248
-; RV32-NEXT:    sw zero, 200(sp)
-; RV32-NEXT:    sw s1, 204(sp)
+; RV32-NEXT:    addi s4, sp, 256
+; RV32-NEXT:    sw zero, 208(sp)
+; RV32-NEXT:    sw s1, 212(sp)
 ; RV32-NEXT:    vand.vx v6, v10, s1
-; RV32-NEXT:    addi s1, sp, 240
-; RV32-NEXT:    sw zero, 192(sp)
-; RV32-NEXT:    sw s0, 196(sp)
+; RV32-NEXT:    addi s1, sp, 248
+; RV32-NEXT:    sw zero, 200(sp)
+; RV32-NEXT:    sw s0, 204(sp)
 ; RV32-NEXT:    vand.vx v4, v10, s0
-; RV32-NEXT:    addi s0, sp, 232
-; RV32-NEXT:    sw zero, 184(sp)
-; RV32-NEXT:    sw t5, 188(sp)
+; RV32-NEXT:    addi s0, sp, 240
+; RV32-NEXT:    sw zero, 192(sp)
+; RV32-NEXT:    sw t5, 196(sp)
 ; RV32-NEXT:    vand.vx v2, v10, t5
 ; RV32-NEXT:    slli s2, s2, 11
 ; RV32-NEXT:    vand.vx v24, v10, ra
+; RV32-NEXT:    sw zero, 184(sp)
+; RV32-NEXT:    sw s2, 188(sp)
 ; RV32-NEXT:    sw zero, 176(sp)
-; RV32-NEXT:    sw s2, 180(sp)
-; RV32-NEXT:    sw zero, 168(sp)
-; RV32-NEXT:    sw ra, 172(sp)
-; RV32-NEXT:    addi t5, sp, 216
+; RV32-NEXT:    sw ra, 180(sp)
+; RV32-NEXT:    addi t5, sp, 224
 ; RV32-NEXT:    vand.vx v26, v10, s8
-; RV32-NEXT:    sw zero, 160(sp)
-; RV32-NEXT:    sw s8, 164(sp)
-; RV32-NEXT:    addi s8, sp, 208
+; RV32-NEXT:    sw zero, 168(sp)
+; RV32-NEXT:    sw s8, 172(sp)
+; RV32-NEXT:    addi s8, sp, 216
 ; RV32-NEXT:    vand.vx v22, v10, s10
-; RV32-NEXT:    sw zero, 152(sp)
-; RV32-NEXT:    sw s10, 156(sp)
-; RV32-NEXT:    addi s10, sp, 200
+; RV32-NEXT:    sw zero, 160(sp)
+; RV32-NEXT:    sw s10, 164(sp)
+; RV32-NEXT:    addi s10, sp, 208
 ; RV32-NEXT:    vmul.vv v28, v8, v28
 ; RV32-NEXT:    vmul.vv v20, v8, v20
 ; RV32-NEXT:    vxor.vi v20, v20, 0
 ; RV32-NEXT:    vxor.vv v20, v20, v28
 ; RV32-NEXT:    vand.vx v28, v10, s11
-; RV32-NEXT:    sw zero, 144(sp)
-; RV32-NEXT:    sw s11, 148(sp)
-; RV32-NEXT:    addi s11, sp, 192
+; RV32-NEXT:    sw zero, 152(sp)
+; RV32-NEXT:    sw s11, 156(sp)
+; RV32-NEXT:    addi s11, sp, 200
 ; RV32-NEXT:    vmul.vv v30, v8, v30
 ; RV32-NEXT:    vxor.vv v20, v20, v30
 ; RV32-NEXT:    vand.vx v30, v10, s9
-; RV32-NEXT:    sw zero, 136(sp)
-; RV32-NEXT:    sw s9, 140(sp)
-; RV32-NEXT:    addi s9, sp, 184
+; RV32-NEXT:    sw zero, 144(sp)
+; RV32-NEXT:    sw s9, 148(sp)
+; RV32-NEXT:    addi s9, sp, 192
 ; RV32-NEXT:    vmul.vv v14, v8, v14
 ; RV32-NEXT:    vxor.vv v14, v20, v14
 ; RV32-NEXT:    vand.vx v20, v10, s3
@@ -2386,39 +2386,39 @@ define <4 x i64> @clmul_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 288
 ; RV32-NEXT:    vs2r.v v20, (a0) # vscale x 16-byte Folded Spill
-; RV32-NEXT:    sw zero, 128(sp)
-; RV32-NEXT:    sw s3, 132(sp)
-; RV32-NEXT:    addi s3, sp, 176
+; RV32-NEXT:    sw zero, 136(sp)
+; RV32-NEXT:    sw s3, 140(sp)
+; RV32-NEXT:    addi s3, sp, 184
 ; RV32-NEXT:    vmul.vv v12, v8, v12
 ; RV32-NEXT:    vxor.vv v12, v14, v12
 ; RV32-NEXT:    vand.vx v14, v10, t6
-; RV32-NEXT:    sw zero, 120(sp)
-; RV32-NEXT:    sw t6, 124(sp)
-; RV32-NEXT:    addi t6, sp, 168
+; RV32-NEXT:    sw zero, 128(sp)
+; RV32-NEXT:    sw t6, 132(sp)
+; RV32-NEXT:    addi t6, sp, 176
 ; RV32-NEXT:    vmul.vv v16, v8, v16
 ; RV32-NEXT:    vxor.vv v12, v12, v16
 ; RV32-NEXT:    vand.vx v16, v10, t4
-; RV32-NEXT:    sw zero, 112(sp)
-; RV32-NEXT:    sw t4, 116(sp)
-; RV32-NEXT:    addi t4, sp, 160
+; RV32-NEXT:    sw zero, 120(sp)
+; RV32-NEXT:    sw t4, 124(sp)
+; RV32-NEXT:    addi t4, sp, 168
 ; RV32-NEXT:    vmul.vv v18, v8, v18
 ; RV32-NEXT:    vxor.vv v18, v12, v18
 ; RV32-NEXT:    vand.vx v12, v10, t3
-; RV32-NEXT:    sw zero, 104(sp)
-; RV32-NEXT:    sw t3, 108(sp)
-; RV32-NEXT:    addi t3, sp, 152
+; RV32-NEXT:    sw zero, 112(sp)
+; RV32-NEXT:    sw t3, 116(sp)
+; RV32-NEXT:    addi t3, sp, 160
 ; RV32-NEXT:    vmul.vv v20, v8, v0
 ; RV32-NEXT:    vxor.vv v18, v18, v20
 ; RV32-NEXT:    vand.vx v20, v10, t2
-; RV32-NEXT:    sw zero, 96(sp)
-; RV32-NEXT:    sw t2, 100(sp)
-; RV32-NEXT:    addi t2, sp, 144
+; RV32-NEXT:    sw zero, 104(sp)
+; RV32-NEXT:    sw t2, 108(sp)
+; RV32-NEXT:    addi t2, sp, 152
 ; RV32-NEXT:    vmul.vv v6, v8, v6
 ; RV32-NEXT:    vxor.vv v18, v18, v6
 ; RV32-NEXT:    vand.vx v6, v10, t1
-; RV32-NEXT:    sw zero, 88(sp)
-; RV32-NEXT:    sw t1, 92(sp)
-; RV32-NEXT:    addi t1, sp, 136
+; RV32-NEXT:    sw zero, 96(sp)
+; RV32-NEXT:    sw t1, 100(sp)
+; RV32-NEXT:    addi t1, sp, 144
 ; RV32-NEXT:    vmul.vv v4, v8, v4
 ; RV32-NEXT:    vxor.vv v18, v18, v4
 ; RV32-NEXT:    vand.vx v4, v10, t0
@@ -2432,55 +2432,55 @@ define <4 x i64> @clmul_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 288
 ; RV32-NEXT:    vs2r.v v4, (a0) # vscale x 16-byte Folded Spill
-; RV32-NEXT:    sw zero, 80(sp)
-; RV32-NEXT:    sw t0, 84(sp)
-; RV32-NEXT:    addi t0, sp, 128
+; RV32-NEXT:    sw zero, 88(sp)
+; RV32-NEXT:    sw t0, 92(sp)
+; RV32-NEXT:    addi t0, sp, 136
 ; RV32-NEXT:    vmul.vv v2, v8, v2
 ; RV32-NEXT:    vxor.vv v18, v18, v2
 ; RV32-NEXT:    vand.vx v2, v10, s2
-; RV32-NEXT:    addi ra, sp, 120
+; RV32-NEXT:    addi ra, sp, 128
 ; RV32-NEXT:    vmul.vv v2, v8, v2
 ; RV32-NEXT:    vxor.vv v18, v18, v2
 ; RV32-NEXT:    vand.vx v2, v10, a7
-; RV32-NEXT:    sw zero, 72(sp)
-; RV32-NEXT:    sw a7, 76(sp)
-; RV32-NEXT:    addi a7, sp, 112
+; RV32-NEXT:    sw zero, 80(sp)
+; RV32-NEXT:    sw a7, 84(sp)
+; RV32-NEXT:    addi a7, sp, 120
 ; RV32-NEXT:    vmul.vv v24, v8, v24
 ; RV32-NEXT:    vxor.vv v18, v18, v24
 ; RV32-NEXT:    vand.vx v4, v10, a6
-; RV32-NEXT:    sw zero, 64(sp)
-; RV32-NEXT:    sw a6, 68(sp)
-; RV32-NEXT:    addi a6, sp, 104
+; RV32-NEXT:    sw zero, 72(sp)
+; RV32-NEXT:    sw a6, 76(sp)
+; RV32-NEXT:    addi a6, sp, 112
 ; RV32-NEXT:    vmul.vv v26, v8, v26
 ; RV32-NEXT:    vxor.vv v18, v18, v26
 ; RV32-NEXT:    vand.vx v26, v10, a5
-; RV32-NEXT:    sw zero, 56(sp)
-; RV32-NEXT:    sw a5, 60(sp)
-; RV32-NEXT:    addi a5, sp, 96
+; RV32-NEXT:    sw zero, 64(sp)
+; RV32-NEXT:    sw a5, 68(sp)
+; RV32-NEXT:    addi a5, sp, 104
 ; RV32-NEXT:    vmul.vv v22, v8, v22
 ; RV32-NEXT:    vxor.vv v18, v18, v22
 ; RV32-NEXT:    vand.vx v24, v10, a4
-; RV32-NEXT:    sw zero, 48(sp)
-; RV32-NEXT:    sw a4, 52(sp)
-; RV32-NEXT:    addi a4, sp, 88
+; RV32-NEXT:    sw zero, 56(sp)
+; RV32-NEXT:    sw a4, 60(sp)
+; RV32-NEXT:    addi a4, sp, 96
 ; RV32-NEXT:    vmul.vv v28, v8, v28
 ; RV32-NEXT:    vxor.vv v18, v18, v28
 ; RV32-NEXT:    vand.vx v28, v10, a3
-; RV32-NEXT:    sw zero, 40(sp)
-; RV32-NEXT:    sw a3, 44(sp)
-; RV32-NEXT:    addi a3, sp, 80
+; RV32-NEXT:    sw zero, 48(sp)
+; RV32-NEXT:    sw a3, 52(sp)
+; RV32-NEXT:    addi a3, sp, 88
 ; RV32-NEXT:    vmul.vv v30, v8, v30
 ; RV32-NEXT:    vxor.vv v18, v18, v30
 ; RV32-NEXT:    vand.vx v30, v10, a2
+; RV32-NEXT:    sw zero, 40(sp)
+; RV32-NEXT:    sw a2, 44(sp)
+; RV32-NEXT:    addi a2, sp, 80
 ; RV32-NEXT:    sw zero, 32(sp)
-; RV32-NEXT:    sw a2, 36(sp)
-; RV32-NEXT:    addi a2, sp, 72
-; RV32-NEXT:    sw zero, 24(sp)
 ; RV32-NEXT:    lui a0, 262144
-; RV32-NEXT:    sw a0, 28(sp)
-; RV32-NEXT:    sw zero, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    addi a1, sp, 64
+; RV32-NEXT:    sw a0, 36(sp)
+; RV32-NEXT:    sw zero, 24(sp)
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    addi a1, sp, 72
 ; RV32-NEXT:    sw a6, 4(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    csrr a6, vlenb
 ; RV32-NEXT:    slli a6, a6, 3
@@ -2501,7 +2501,7 @@ define <4 x i64> @clmul_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; RV32-NEXT:    add a6, sp, a6
 ; RV32-NEXT:    addi a6, a6, 288
 ; RV32-NEXT:    vs2r.v v18, (a6) # vscale x 16-byte Folded Spill
-; RV32-NEXT:    addi s7, sp, 56
+; RV32-NEXT:    addi s7, sp, 64
 ; RV32-NEXT:    vmul.vv v14, v8, v14
 ; RV32-NEXT:    vxor.vv v14, v0, v14
 ; RV32-NEXT:    vlse64.v v18, (s6), zero
@@ -2513,7 +2513,7 @@ define <4 x i64> @clmul_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; RV32-NEXT:    add a6, sp, a6
 ; RV32-NEXT:    addi a6, a6, 288
 ; RV32-NEXT:    vs2r.v v18, (a6) # vscale x 16-byte Folded Spill
-; RV32-NEXT:    addi s2, sp, 48
+; RV32-NEXT:    addi s2, sp, 56
 ; RV32-NEXT:    vmul.vv v16, v8, v16
 ; RV32-NEXT:    vxor.vv v14, v14, v16
 ; RV32-NEXT:    vlse64.v v16, (s5), zero
@@ -2525,7 +2525,7 @@ define <4 x i64> @clmul_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; RV32-NEXT:    add a6, sp, a6
 ; RV32-NEXT:    addi a6, a6, 288
 ; RV32-NEXT:    vs2r.v v16, (a6) # vscale x 16-byte Folded Spill
-; RV32-NEXT:    addi s5, sp, 40
+; RV32-NEXT:    addi s5, sp, 48
 ; RV32-NEXT:    vmul.vv v12, v8, v12
 ; RV32-NEXT:    vxor.vv v12, v14, v12
 ; RV32-NEXT:    vlse64.v v14, (s4), zero
@@ -2534,7 +2534,7 @@ define <4 x i64> @clmul_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; RV32-NEXT:    add a6, sp, a6
 ; RV32-NEXT:    addi a6, a6, 288
 ; RV32-NEXT:    vs2r.v v14, (a6) # vscale x 16-byte Folded Spill
-; RV32-NEXT:    addi s4, sp, 32
+; RV32-NEXT:    addi s4, sp, 40
 ; RV32-NEXT:    vmul.vv v20, v8, v20
 ; RV32-NEXT:    vxor.vv v20, v12, v20
 ; RV32-NEXT:    vlse64.v v12, (s1), zero
@@ -2550,7 +2550,7 @@ define <4 x i64> @clmul_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; RV32-NEXT:    add a6, sp, a6
 ; RV32-NEXT:    addi a6, a6, 288
 ; RV32-NEXT:    vs2r.v v12, (a6) # vscale x 16-byte Folded Spill
-; RV32-NEXT:    addi s1, sp, 24
+; RV32-NEXT:    addi s1, sp, 32
 ; RV32-NEXT:    vmul.vv v6, v8, v6
 ; RV32-NEXT:    vxor.vv v20, v20, v6
 ; RV32-NEXT:    vlse64.v v12, (s0), zero
@@ -2564,7 +2564,7 @@ define <4 x i64> @clmul_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; RV32-NEXT:    add a6, sp, a6
 ; RV32-NEXT:    addi a6, a6, 288
 ; RV32-NEXT:    vs2r.v v12, (a6) # vscale x 16-byte Folded Spill
-; RV32-NEXT:    addi s0, sp, 16
+; RV32-NEXT:    addi s0, sp, 24
 ; RV32-NEXT:    csrr s6, vlenb
 ; RV32-NEXT:    slli s6, s6, 1
 ; RV32-NEXT:    mv a6, s6
@@ -2584,7 +2584,7 @@ define <4 x i64> @clmul_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; RV32-NEXT:    vmul.vv v28, v8, v28
 ; RV32-NEXT:    vmul.vv v30, v8, v30
 ; RV32-NEXT:    vxor.vv v20, v20, v6
-; RV32-NEXT:    addi s6, sp, 224
+; RV32-NEXT:    addi s6, sp, 232
 ; RV32-NEXT:    vlse64.v v0, (s6), zero
 ; RV32-NEXT:    vxor.vv v20, v20, v2
 ; RV32-NEXT:    vlse64.v v6, (t5), zero
@@ -3433,25 +3433,25 @@ define <8 x i64> @clmul_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; RV32-NEXT:    lui a6, 8192
 ; RV32-NEXT:    lui a5, 16384
 ; RV32-NEXT:    lui a4, 32768
-; RV32-NEXT:    sw a1, 272(sp)
-; RV32-NEXT:    sw zero, 276(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    sw zero, 20(sp)
+; RV32-NEXT:    sw zero, 272(sp)
+; RV32-NEXT:    sw s4, 276(sp)
 ; RV32-NEXT:    sw zero, 264(sp)
-; RV32-NEXT:    sw s4, 268(sp)
-; RV32-NEXT:    sw zero, 256(sp)
-; RV32-NEXT:    sw a3, 260(sp)
+; RV32-NEXT:    sw a3, 268(sp)
 ; RV32-NEXT:    lui a3, 65536
-; RV32-NEXT:    sw zero, 248(sp)
-; RV32-NEXT:    sw a2, 252(sp)
+; RV32-NEXT:    sw zero, 256(sp)
+; RV32-NEXT:    sw a2, 260(sp)
 ; RV32-NEXT:    lui a2, 131072
-; RV32-NEXT:    sw zero, 240(sp)
-; RV32-NEXT:    sw a0, 244(sp)
+; RV32-NEXT:    sw zero, 248(sp)
+; RV32-NEXT:    sw a0, 252(sp)
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vi v28, v12, 2
 ; RV32-NEXT:    vand.vi v4, v12, 1
 ; RV32-NEXT:    vand.vi v24, v12, 4
 ; RV32-NEXT:    vand.vi v20, v12, 8
-; RV32-NEXT:    sw zero, 232(sp)
-; RV32-NEXT:    sw s3, 236(sp)
+; RV32-NEXT:    sw zero, 240(sp)
+; RV32-NEXT:    sw s3, 244(sp)
 ; RV32-NEXT:    vand.vx v16, v12, s3
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 2
@@ -3465,33 +3465,33 @@ define <8 x i64> @clmul_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 288
 ; RV32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
-; RV32-NEXT:    addi s3, sp, 272
-; RV32-NEXT:    sw zero, 224(sp)
-; RV32-NEXT:    sw s2, 228(sp)
+; RV32-NEXT:    addi s3, sp, 16
+; RV32-NEXT:    sw zero, 232(sp)
+; RV32-NEXT:    sw s2, 236(sp)
 ; RV32-NEXT:    vand.vx v0, v12, s2
-; RV32-NEXT:    addi s2, sp, 264
-; RV32-NEXT:    sw zero, 216(sp)
-; RV32-NEXT:    sw s5, 220(sp)
+; RV32-NEXT:    addi s2, sp, 272
+; RV32-NEXT:    sw zero, 224(sp)
+; RV32-NEXT:    sw s5, 228(sp)
 ; RV32-NEXT:    vmul.vv v16, v8, v28
 ; RV32-NEXT:    vmul.vv v28, v8, v4
 ; RV32-NEXT:    vxor.vi v28, v28, 0
 ; RV32-NEXT:    vxor.vv v28, v28, v16
 ; RV32-NEXT:    vand.vx v16, v12, s5
-; RV32-NEXT:    addi s5, sp, 256
-; RV32-NEXT:    sw zero, 208(sp)
-; RV32-NEXT:    sw s6, 212(sp)
+; RV32-NEXT:    addi s5, sp, 264
+; RV32-NEXT:    sw zero, 216(sp)
+; RV32-NEXT:    sw s6, 220(sp)
 ; RV32-NEXT:    vmul.vv v24, v8, v24
 ; RV32-NEXT:    vxor.vv v28, v28, v24
 ; RV32-NEXT:    vand.vx v24, v12, s6
-; RV32-NEXT:    addi s6, sp, 248
-; RV32-NEXT:    sw zero, 200(sp)
-; RV32-NEXT:    sw s8, 204(sp)
+; RV32-NEXT:    addi s6, sp, 256
+; RV32-NEXT:    sw zero, 208(sp)
+; RV32-NEXT:    sw s8, 212(sp)
 ; RV32-NEXT:    vmul.vv v20, v8, v20
 ; RV32-NEXT:    vxor.vv v20, v28, v20
 ; RV32-NEXT:    vand.vx v28, v12, s8
-; RV32-NEXT:    addi s8, sp, 240
-; RV32-NEXT:    sw zero, 192(sp)
-; RV32-NEXT:    sw s1, 196(sp)
+; RV32-NEXT:    addi s8, sp, 248
+; RV32-NEXT:    sw zero, 200(sp)
+; RV32-NEXT:    sw s1, 204(sp)
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 2
 ; RV32-NEXT:    mv a1, a0
@@ -3507,8 +3507,8 @@ define <8 x i64> @clmul_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; RV32-NEXT:    vmul.vv v4, v8, v4
 ; RV32-NEXT:    vxor.vv v20, v20, v4
 ; RV32-NEXT:    vand.vx v4, v12, s1
-; RV32-NEXT:    sw zero, 184(sp)
-; RV32-NEXT:    sw s7, 188(sp)
+; RV32-NEXT:    sw zero, 192(sp)
+; RV32-NEXT:    sw s7, 196(sp)
 ; RV32-NEXT:    vmul.vv v0, v8, v0
 ; RV32-NEXT:    vxor.vv v20, v20, v0
 ; RV32-NEXT:    vand.vx v0, v12, s7
@@ -3516,87 +3516,87 @@ define <8 x i64> @clmul_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; RV32-NEXT:    vmul.vv v16, v8, v16
 ; RV32-NEXT:    vxor.vv v20, v20, v16
 ; RV32-NEXT:    vand.vx v16, v12, ra
+; RV32-NEXT:    sw zero, 184(sp)
+; RV32-NEXT:    sw a0, 188(sp)
 ; RV32-NEXT:    sw zero, 176(sp)
-; RV32-NEXT:    sw a0, 180(sp)
-; RV32-NEXT:    sw zero, 168(sp)
-; RV32-NEXT:    sw ra, 172(sp)
-; RV32-NEXT:    addi s4, sp, 216
+; RV32-NEXT:    sw ra, 180(sp)
+; RV32-NEXT:    addi s4, sp, 224
 ; RV32-NEXT:    vmul.vv v24, v8, v24
 ; RV32-NEXT:    vxor.vv v24, v20, v24
 ; RV32-NEXT:    vand.vx v20, v12, s11
-; RV32-NEXT:    sw zero, 160(sp)
-; RV32-NEXT:    sw s11, 164(sp)
-; RV32-NEXT:    addi s11, sp, 208
+; RV32-NEXT:    sw zero, 168(sp)
+; RV32-NEXT:    sw s11, 172(sp)
+; RV32-NEXT:    addi s11, sp, 216
 ; RV32-NEXT:    vmul.vv v28, v8, v28
 ; RV32-NEXT:    vxor.vv v28, v24, v28
 ; RV32-NEXT:    vand.vx v24, v12, s10
-; RV32-NEXT:    sw zero, 152(sp)
-; RV32-NEXT:    sw s10, 156(sp)
-; RV32-NEXT:    addi s10, sp, 200
+; RV32-NEXT:    sw zero, 160(sp)
+; RV32-NEXT:    sw s10, 164(sp)
+; RV32-NEXT:    addi s10, sp, 208
 ; RV32-NEXT:    vmul.vv v4, v8, v4
 ; RV32-NEXT:    vxor.vv v4, v28, v4
 ; RV32-NEXT:    vand.vx v28, v12, s9
-; RV32-NEXT:    sw zero, 144(sp)
-; RV32-NEXT:    sw s9, 148(sp)
-; RV32-NEXT:    addi s9, sp, 192
+; RV32-NEXT:    sw zero, 152(sp)
+; RV32-NEXT:    sw s9, 156(sp)
+; RV32-NEXT:    addi s9, sp, 200
 ; RV32-NEXT:    vmul.vv v0, v8, v0
 ; RV32-NEXT:    vxor.vv v4, v4, v0
 ; RV32-NEXT:    vand.vx v0, v12, a0
-; RV32-NEXT:    addi ra, sp, 184
+; RV32-NEXT:    addi ra, sp, 192
 ; RV32-NEXT:    vmul.vv v0, v8, v0
 ; RV32-NEXT:    vxor.vv v0, v4, v0
 ; RV32-NEXT:    vand.vx v4, v12, s0
-; RV32-NEXT:    sw zero, 136(sp)
-; RV32-NEXT:    sw s0, 140(sp)
-; RV32-NEXT:    addi s1, sp, 176
+; RV32-NEXT:    sw zero, 144(sp)
+; RV32-NEXT:    sw s0, 148(sp)
+; RV32-NEXT:    addi s1, sp, 184
 ; RV32-NEXT:    vmul.vv v16, v8, v16
 ; RV32-NEXT:    vxor.vv v0, v0, v16
 ; RV32-NEXT:    vand.vx v16, v12, t6
-; RV32-NEXT:    sw zero, 128(sp)
-; RV32-NEXT:    sw t6, 132(sp)
-; RV32-NEXT:    addi s0, sp, 168
+; RV32-NEXT:    sw zero, 136(sp)
+; RV32-NEXT:    sw t6, 140(sp)
+; RV32-NEXT:    addi s0, sp, 176
 ; RV32-NEXT:    vmul.vv v20, v8, v20
 ; RV32-NEXT:    vxor.vv v0, v0, v20
 ; RV32-NEXT:    vand.vx v20, v12, t5
-; RV32-NEXT:    sw zero, 120(sp)
-; RV32-NEXT:    sw t5, 124(sp)
-; RV32-NEXT:    addi t6, sp, 160
+; RV32-NEXT:    sw zero, 128(sp)
+; RV32-NEXT:    sw t5, 132(sp)
+; RV32-NEXT:    addi t6, sp, 168
 ; RV32-NEXT:    vmul.vv v24, v8, v24
 ; RV32-NEXT:    vxor.vv v0, v0, v24
 ; RV32-NEXT:    vand.vx v24, v12, t4
-; RV32-NEXT:    sw zero, 112(sp)
-; RV32-NEXT:    sw t4, 116(sp)
-; RV32-NEXT:    addi t5, sp, 152
+; RV32-NEXT:    sw zero, 120(sp)
+; RV32-NEXT:    sw t4, 124(sp)
+; RV32-NEXT:    addi t5, sp, 160
 ; RV32-NEXT:    vmul.vv v28, v8, v28
 ; RV32-NEXT:    vxor.vv v0, v0, v28
 ; RV32-NEXT:    vand.vx v28, v12, t3
-; RV32-NEXT:    sw zero, 104(sp)
-; RV32-NEXT:    sw t3, 108(sp)
-; RV32-NEXT:    addi t4, sp, 144
+; RV32-NEXT:    sw zero, 112(sp)
+; RV32-NEXT:    sw t3, 116(sp)
+; RV32-NEXT:    addi t4, sp, 152
 ; RV32-NEXT:    vmul.vv v4, v8, v4
 ; RV32-NEXT:    vxor.vv v0, v0, v4
 ; RV32-NEXT:    vand.vx v4, v12, t2
-; RV32-NEXT:    sw zero, 96(sp)
-; RV32-NEXT:    sw t2, 100(sp)
-; RV32-NEXT:    addi t3, sp, 136
+; RV32-NEXT:    sw zero, 104(sp)
+; RV32-NEXT:    sw t2, 108(sp)
+; RV32-NEXT:    addi t3, sp, 144
 ; RV32-NEXT:    vmul.vv v16, v8, v16
 ; RV32-NEXT:    vxor.vv v16, v0, v16
 ; RV32-NEXT:    vand.vx v0, v12, t1
-; RV32-NEXT:    sw zero, 88(sp)
-; RV32-NEXT:    sw t1, 92(sp)
-; RV32-NEXT:    addi t2, sp, 128
+; RV32-NEXT:    sw zero, 96(sp)
+; RV32-NEXT:    sw t1, 100(sp)
+; RV32-NEXT:    addi t2, sp, 136
 ; RV32-NEXT:    vmul.vv v20, v8, v20
 ; RV32-NEXT:    vxor.vv v20, v16, v20
 ; RV32-NEXT:    vand.vx v16, v12, t0
-; RV32-NEXT:    sw zero, 80(sp)
-; RV32-NEXT:    sw t0, 84(sp)
-; RV32-NEXT:    addi t1, sp, 120
+; RV32-NEXT:    sw zero, 88(sp)
+; RV32-NEXT:    sw t0, 92(sp)
+; RV32-NEXT:    addi t1, sp, 128
 ; RV32-NEXT:    vmul.vv v24, v8, v24
 ; RV32-NEXT:    vxor.vv v24, v20, v24
 ; RV32-NEXT:    vand.vx v20, v12, a7
-; RV32-NEXT:    sw zero, 72(sp)
-; RV32-NEXT:    sw a7, 76(sp)
-; RV32-NEXT:    addi t0, sp, 112
+; RV32-NEXT:    sw zero, 80(sp)
+; RV32-NEXT:    sw a7, 84(sp)
+; RV32-NEXT:    addi t0, sp, 120
 ; RV32-NEXT:    vmul.vv v28, v8, v28
 ; RV32-NEXT:    vxor.vv v24, v24, v28
 ; RV32-NEXT:    vand.vx v28, v12, a6
@@ -3612,9 +3612,9 @@ define <8 x i64> @clmul_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 288
 ; RV32-NEXT:    vs4r.v v28, (a0) # vscale x 32-byte Folded Spill
-; RV32-NEXT:    sw zero, 64(sp)
-; RV32-NEXT:    sw a6, 68(sp)
-; RV32-NEXT:    addi a7, sp, 104
+; RV32-NEXT:    sw zero, 72(sp)
+; RV32-NEXT:    sw a6, 76(sp)
+; RV32-NEXT:    addi a7, sp, 112
 ; RV32-NEXT:    vmul.vv v28, v8, v4
 ; RV32-NEXT:    vxor.vv v24, v24, v28
 ; RV32-NEXT:    vand.vx v28, v12, a5
@@ -3628,34 +3628,34 @@ define <8 x i64> @clmul_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 288
 ; RV32-NEXT:    vs4r.v v28, (a0) # vscale x 32-byte Folded Spill
-; RV32-NEXT:    sw zero, 56(sp)
-; RV32-NEXT:    sw a5, 60(sp)
-; RV32-NEXT:    addi a6, sp, 96
+; RV32-NEXT:    sw zero, 64(sp)
+; RV32-NEXT:    sw a5, 68(sp)
+; RV32-NEXT:    addi a6, sp, 104
 ; RV32-NEXT:    vmul.vv v28, v8, v0
 ; RV32-NEXT:    vxor.vv v28, v24, v28
 ; RV32-NEXT:    vand.vx v24, v12, a4
-; RV32-NEXT:    sw zero, 48(sp)
-; RV32-NEXT:    sw a4, 52(sp)
-; RV32-NEXT:    addi a5, sp, 88
+; RV32-NEXT:    sw zero, 56(sp)
+; RV32-NEXT:    sw a4, 60(sp)
+; RV32-NEXT:    addi a5, sp, 96
 ; RV32-NEXT:    vmul.vv v16, v8, v16
 ; RV32-NEXT:    vxor.vv v16, v28, v16
 ; RV32-NEXT:    vand.vx v28, v12, a3
-; RV32-NEXT:    sw zero, 40(sp)
-; RV32-NEXT:    sw a3, 44(sp)
-; RV32-NEXT:    addi a4, sp, 80
+; RV32-NEXT:    sw zero, 48(sp)
+; RV32-NEXT:    sw a3, 52(sp)
+; RV32-NEXT:    addi a4, sp, 88
 ; RV32-NEXT:    vmul.vv v20, v8, v20
 ; RV32-NEXT:    vxor.vv v16, v16, v20
 ; RV32-NEXT:    vand.vx v4, v12, a2
+; RV32-NEXT:    sw zero, 40(sp)
+; RV32-NEXT:    sw a2, 44(sp)
+; RV32-NEXT:    addi a3, sp, 80
 ; RV32-NEXT:    sw zero, 32(sp)
-; RV32-NEXT:    sw a2, 36(sp)
-; RV32-NEXT:    addi a3, sp, 72
-; RV32-NEXT:    sw zero, 24(sp)
 ; RV32-NEXT:    lui a1, 262144
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw zero, 16(sp)
+; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw zero, 24(sp)
 ; RV32-NEXT:    lui a0, 524288
-; RV32-NEXT:    sw a0, 20(sp)
-; RV32-NEXT:    addi a2, sp, 64
+; RV32-NEXT:    sw a0, 28(sp)
+; RV32-NEXT:    addi a2, sp, 72
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 2
 ; RV32-NEXT:    mv s7, a0
@@ -3671,7 +3671,7 @@ define <8 x i64> @clmul_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; RV32-NEXT:    vmul.vv v20, v8, v20
 ; RV32-NEXT:    vxor.vv v20, v16, v20
 ; RV32-NEXT:    vlse64.v v16, (s3), zero
-; RV32-NEXT:    addi s3, sp, 56
+; RV32-NEXT:    addi s3, sp, 64
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    mv s7, a0
@@ -3685,15 +3685,15 @@ define <8 x i64> @clmul_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; RV32-NEXT:    vmul.vv v0, v8, v0
 ; RV32-NEXT:    vxor.vv v0, v20, v0
 ; RV32-NEXT:    vlse64.v v20, (s2), zero
-; RV32-NEXT:    addi s2, sp, 48
+; RV32-NEXT:    addi s2, sp, 56
 ; RV32-NEXT:    vmul.vv v24, v8, v24
 ; RV32-NEXT:    vxor.vv v0, v0, v24
 ; RV32-NEXT:    vlse64.v v24, (s5), zero
-; RV32-NEXT:    addi s5, sp, 40
+; RV32-NEXT:    addi s5, sp, 48
 ; RV32-NEXT:    vmul.vv v28, v8, v28
 ; RV32-NEXT:    vxor.vv v0, v0, v28
 ; RV32-NEXT:    vlse64.v v28, (s6), zero
-; RV32-NEXT:    addi s6, sp, 32
+; RV32-NEXT:    addi s6, sp, 40
 ; RV32-NEXT:    vmul.vv v4, v8, v4
 ; RV32-NEXT:    vxor.vv v4, v0, v4
 ; RV32-NEXT:    csrr a0, vlenb
@@ -3709,7 +3709,7 @@ define <8 x i64> @clmul_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; RV32-NEXT:    addi a0, a0, 288
 ; RV32-NEXT:    vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
 ; RV32-NEXT:    vlse64.v v4, (s8), zero
-; RV32-NEXT:    addi s8, sp, 24
+; RV32-NEXT:    addi s8, sp, 32
 ; RV32-NEXT:    vand.vv v16, v12, v16
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
@@ -3765,10 +3765,10 @@ define <8 x i64> @clmul_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 288
 ; RV32-NEXT:    vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    addi s7, sp, 232
+; RV32-NEXT:    addi a0, sp, 24
+; RV32-NEXT:    addi s7, sp, 240
 ; RV32-NEXT:    vlse64.v v16, (s7), zero
-; RV32-NEXT:    addi s7, sp, 224
+; RV32-NEXT:    addi s7, sp, 232
 ; RV32-NEXT:    vlse64.v v20, (s7), zero
 ; RV32-NEXT:    vlse64.v v24, (s4), zero
 ; RV32-NEXT:    vlse64.v v28, (s11), zero
diff --git a/llvm/test/CodeGen/X86/clmul-vector-256.ll b/llvm/test/CodeGen/X86/clmul-vector-256.ll
index 303aeeae3f5df..3c0929f70560a 100644
--- a/llvm/test/CodeGen/X86/clmul-vector-256.ll
+++ b/llvm/test/CodeGen/X86/clmul-vector-256.ll
@@ -8,19 +8,46 @@ define <32 x i8> @clmul_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX1-LABEL: clmul_v32i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm5
+; AVX1-NEXT:    vpmullw %xmm5, %xmm2, %xmm3
+; AVX1-NEXT:    vpmullw %xmm4, %xmm0, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm6, %ymm6
+; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT:    vandps %ymm3, %ymm6, %ymm6
+; AVX1-NEXT:    vandnps %xmm4, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaddubsw %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vpsllw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vandnps %xmm5, %xmm3, %xmm5
+; AVX1-NEXT:    vpmaddubsw %xmm5, %xmm2, %xmm5
+; AVX1-NEXT:    vpsllw $8, %xmm5, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; AVX1-NEXT:    vorps %ymm4, %ymm6, %ymm4
 ; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm5
 ; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
-; AVX1-NEXT:    vpmullw %xmm6, %xmm2, %xmm3
-; AVX1-NEXT:    vpmullw %xmm5, %xmm0, %xmm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX1-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm7
-; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX1-NEXT:    vpandn %xmm5, %xmm3, %xmm5
+; AVX1-NEXT:    vpmullw %xmm6, %xmm2, %xmm7
+; AVX1-NEXT:    vpmullw %xmm5, %xmm0, %xmm8
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm8, %ymm7
+; AVX1-NEXT:    vandps %ymm3, %ymm7, %ymm7
+; AVX1-NEXT:    vandnps %xmm5, %xmm3, %xmm5
 ; AVX1-NEXT:    vpmaddubsw %xmm5, %xmm0, %xmm5
 ; AVX1-NEXT:    vpsllw $8, %xmm5, %xmm5
-; AVX1-NEXT:    vpandn %xmm6, %xmm3, %xmm6
+; AVX1-NEXT:    vandnps %xmm6, %xmm3, %xmm6
+; AVX1-NEXT:    vpmaddubsw %xmm6, %xmm2, %xmm6
+; AVX1-NEXT:    vpsllw $8, %xmm6, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT:    vorps %ymm5, %ymm7, %ymm5
+; AVX1-NEXT:    vxorps %ymm4, %ymm5, %ymm4
+; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm5
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpmullw %xmm6, %xmm2, %xmm7
+; AVX1-NEXT:    vpmullw %xmm5, %xmm0, %xmm8
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm8, %ymm7
+; AVX1-NEXT:    vandps %ymm3, %ymm7, %ymm7
+; AVX1-NEXT:    vandnps %xmm5, %xmm3, %xmm5
+; AVX1-NEXT:    vpmaddubsw %xmm5, %xmm0, %xmm5
+; AVX1-NEXT:    vpsllw $8, %xmm5, %xmm5
+; AVX1-NEXT:    vandnps %xmm6, %xmm3, %xmm6
 ; AVX1-NEXT:    vpmaddubsw %xmm6, %xmm2, %xmm6
 ; AVX1-NEXT:    vpsllw $8, %xmm6, %xmm6
 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
@@ -30,101 +57,72 @@ define <32 x i8> @clmul_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX1-NEXT:    vpmullw %xmm7, %xmm2, %xmm8
 ; AVX1-NEXT:    vpmullw %xmm6, %xmm0, %xmm9
 ; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT:    vandps %ymm4, %ymm8, %ymm8
-; AVX1-NEXT:    vpandn %xmm6, %xmm3, %xmm6
+; AVX1-NEXT:    vandps %ymm3, %ymm8, %ymm8
+; AVX1-NEXT:    vandnps %xmm6, %xmm3, %xmm6
 ; AVX1-NEXT:    vpmaddubsw %xmm6, %xmm0, %xmm6
 ; AVX1-NEXT:    vpsllw $8, %xmm6, %xmm6
-; AVX1-NEXT:    vpandn %xmm7, %xmm3, %xmm7
+; AVX1-NEXT:    vandnps %xmm7, %xmm3, %xmm7
 ; AVX1-NEXT:    vpmaddubsw %xmm7, %xmm2, %xmm7
 ; AVX1-NEXT:    vpsllw $8, %xmm7, %xmm7
 ; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm6
 ; AVX1-NEXT:    vorps %ymm6, %ymm8, %ymm6
-; AVX1-NEXT:    vxorps %ymm5, %ymm6, %ymm5
+; AVX1-NEXT:    vxorps %ymm6, %ymm5, %ymm5
+; AVX1-NEXT:    vxorps %ymm5, %ymm4, %ymm4
+; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm5
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpmullw %xmm6, %xmm2, %xmm7
+; AVX1-NEXT:    vpmullw %xmm5, %xmm0, %xmm8
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm8, %ymm7
+; AVX1-NEXT:    vandps %ymm3, %ymm7, %ymm7
+; AVX1-NEXT:    vandnps %xmm5, %xmm3, %xmm5
+; AVX1-NEXT:    vpmaddubsw %xmm5, %xmm0, %xmm5
+; AVX1-NEXT:    vpsllw $8, %xmm5, %xmm5
+; AVX1-NEXT:    vandnps %xmm6, %xmm3, %xmm6
+; AVX1-NEXT:    vpmaddubsw %xmm6, %xmm2, %xmm6
+; AVX1-NEXT:    vpsllw $8, %xmm6, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT:    vorps %ymm5, %ymm7, %ymm5
 ; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm7
 ; AVX1-NEXT:    vpmullw %xmm7, %xmm2, %xmm8
 ; AVX1-NEXT:    vpmullw %xmm6, %xmm0, %xmm9
 ; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT:    vandps %ymm4, %ymm8, %ymm8
-; AVX1-NEXT:    vpandn %xmm6, %xmm3, %xmm6
+; AVX1-NEXT:    vandps %ymm3, %ymm8, %ymm8
+; AVX1-NEXT:    vandnps %xmm6, %xmm3, %xmm6
 ; AVX1-NEXT:    vpmaddubsw %xmm6, %xmm0, %xmm6
 ; AVX1-NEXT:    vpsllw $8, %xmm6, %xmm6
-; AVX1-NEXT:    vpandn %xmm7, %xmm3, %xmm7
+; AVX1-NEXT:    vandnps %xmm7, %xmm3, %xmm7
 ; AVX1-NEXT:    vpmaddubsw %xmm7, %xmm2, %xmm7
 ; AVX1-NEXT:    vpsllw $8, %xmm7, %xmm7
 ; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm6
 ; AVX1-NEXT:    vorps %ymm6, %ymm8, %ymm6
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm7
-; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm8
-; AVX1-NEXT:    vpmullw %xmm2, %xmm8, %xmm9
-; AVX1-NEXT:    vpmullw %xmm7, %xmm0, %xmm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm10, %ymm9
-; AVX1-NEXT:    vandps %ymm4, %ymm9, %ymm9
-; AVX1-NEXT:    vpandn %xmm7, %xmm3, %xmm7
-; AVX1-NEXT:    vpmaddubsw %xmm7, %xmm0, %xmm7
-; AVX1-NEXT:    vpsllw $8, %xmm7, %xmm7
-; AVX1-NEXT:    vpandn %xmm8, %xmm3, %xmm8
-; AVX1-NEXT:    vpmaddubsw %xmm8, %xmm2, %xmm8
-; AVX1-NEXT:    vpsllw $8, %xmm8, %xmm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm7, %ymm7
-; AVX1-NEXT:    vorps %ymm7, %ymm9, %ymm7
-; AVX1-NEXT:    vxorps %ymm7, %ymm6, %ymm6
 ; AVX1-NEXT:    vxorps %ymm6, %ymm5, %ymm5
 ; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm7
 ; AVX1-NEXT:    vpmullw %xmm7, %xmm2, %xmm8
 ; AVX1-NEXT:    vpmullw %xmm6, %xmm0, %xmm9
 ; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT:    vandps %ymm4, %ymm8, %ymm8
-; AVX1-NEXT:    vpandn %xmm6, %xmm3, %xmm6
+; AVX1-NEXT:    vandps %ymm3, %ymm8, %ymm8
+; AVX1-NEXT:    vandnps %xmm6, %xmm3, %xmm6
 ; AVX1-NEXT:    vpmaddubsw %xmm6, %xmm0, %xmm6
 ; AVX1-NEXT:    vpsllw $8, %xmm6, %xmm6
-; AVX1-NEXT:    vpandn %xmm7, %xmm3, %xmm7
+; AVX1-NEXT:    vandnps %xmm7, %xmm3, %xmm7
 ; AVX1-NEXT:    vpmaddubsw %xmm7, %xmm2, %xmm7
 ; AVX1-NEXT:    vpsllw $8, %xmm7, %xmm7
 ; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm6
 ; AVX1-NEXT:    vorps %ymm6, %ymm8, %ymm6
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm7
-; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm8
-; AVX1-NEXT:    vpmullw %xmm2, %xmm8, %xmm9
-; AVX1-NEXT:    vpmullw %xmm7, %xmm0, %xmm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm10, %ymm9
-; AVX1-NEXT:    vandps %ymm4, %ymm9, %ymm9
-; AVX1-NEXT:    vpandn %xmm7, %xmm3, %xmm7
-; AVX1-NEXT:    vpmaddubsw %xmm7, %xmm0, %xmm7
-; AVX1-NEXT:    vpsllw $8, %xmm7, %xmm7
-; AVX1-NEXT:    vpandn %xmm8, %xmm3, %xmm8
-; AVX1-NEXT:    vpmaddubsw %xmm8, %xmm2, %xmm8
-; AVX1-NEXT:    vpsllw $8, %xmm8, %xmm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm7, %ymm7
-; AVX1-NEXT:    vorps %ymm7, %ymm9, %ymm7
-; AVX1-NEXT:    vxorps %ymm7, %ymm6, %ymm6
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm7
-; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm8
-; AVX1-NEXT:    vpmullw %xmm2, %xmm8, %xmm9
-; AVX1-NEXT:    vpmullw %xmm7, %xmm0, %xmm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm10, %ymm9
-; AVX1-NEXT:    vandps %ymm4, %ymm9, %ymm4
-; AVX1-NEXT:    vpandn %xmm7, %xmm3, %xmm7
-; AVX1-NEXT:    vpmaddubsw %xmm7, %xmm0, %xmm7
-; AVX1-NEXT:    vpsllw $8, %xmm7, %xmm7
-; AVX1-NEXT:    vpandn %xmm8, %xmm3, %xmm8
-; AVX1-NEXT:    vpmaddubsw %xmm8, %xmm2, %xmm8
-; AVX1-NEXT:    vpsllw $8, %xmm8, %xmm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm7, %ymm7
-; AVX1-NEXT:    vorps %ymm7, %ymm4, %ymm4
-; AVX1-NEXT:    vxorps %ymm4, %ymm6, %ymm4
-; AVX1-NEXT:    vxorps %ymm4, %ymm5, %ymm4
+; AVX1-NEXT:    vxorps %ymm6, %ymm5, %ymm5
+; AVX1-NEXT:    vxorps %ymm5, %ymm4, %ymm4
 ; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; AVX1-NEXT:    vpmullw %xmm5, %xmm2, %xmm6
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm7
 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
 ; AVX1-NEXT:    vandps %ymm3, %ymm6, %ymm6
-; AVX1-NEXT:    vpandn %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vandnps %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm0
-; AVX1-NEXT:    vpandn %xmm5, %xmm3, %xmm1
+; AVX1-NEXT:    vandnps %xmm5, %xmm3, %xmm1
 ; AVX1-NEXT:    vpmaddubsw %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpsllw $8, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -612,172 +610,181 @@ define <4 x i64> @clmul_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 define <32 x i8> @clmulr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX1-LABEL: clmulr_v32i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vmovdqa %ymm1, %ymm3
+; AVX1-NEXT:    vmovdqa %ymm0, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
 ; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
 ; AVX1-NEXT:    vpsrlw $4, %xmm4, %xmm4
 ; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm6
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
 ; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm6
-; AVX1-NEXT:    vpor %xmm6, %xmm5, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm6
-; AVX1-NEXT:    vpshufb %xmm6, %xmm3, %xmm6
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
-; AVX1-NEXT:    vpor %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm5
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm9
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm6
-; AVX1-NEXT:    vpshufb %xmm6, %xmm3, %xmm6
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
-; AVX1-NEXT:    vpor %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vpmullw %xmm1, %xmm9, %xmm6
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm7
-; AVX1-NEXT:    vpshufb %xmm7, %xmm3, %xmm7
-; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpor %xmm0, %xmm7, %xmm0
-; AVX1-NEXT:    vpmullw %xmm0, %xmm8, %xmm7
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
-; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm7 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX1-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX1-NEXT:    vandps %ymm7, %ymm6, %ymm10
-; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm6 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX1-NEXT:    vpandn %xmm8, %xmm6, %xmm8
-; AVX1-NEXT:    vpmaddubsw %xmm8, %xmm0, %xmm8
-; AVX1-NEXT:    vpsllw $8, %xmm8, %xmm8
-; AVX1-NEXT:    vpandn %xmm9, %xmm6, %xmm9
-; AVX1-NEXT:    vpmaddubsw %xmm9, %xmm1, %xmm9
-; AVX1-NEXT:    vpsllw $8, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm8, %ymm8
-; AVX1-NEXT:    vorps %ymm8, %ymm10, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm1, %xmm10, %xmm11
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm12
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm12, %ymm11
-; AVX1-NEXT:    vandps %ymm7, %ymm11, %ymm11
-; AVX1-NEXT:    vpandn %xmm9, %xmm6, %xmm9
-; AVX1-NEXT:    vpmaddubsw %xmm9, %xmm0, %xmm9
-; AVX1-NEXT:    vpsllw $8, %xmm9, %xmm9
-; AVX1-NEXT:    vpandn %xmm10, %xmm6, %xmm10
-; AVX1-NEXT:    vpmaddubsw %xmm10, %xmm1, %xmm10
-; AVX1-NEXT:    vpsllw $8, %xmm10, %xmm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vorps %ymm9, %ymm11, %ymm9
-; AVX1-NEXT:    vxorps %ymm8, %ymm9, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm1, %xmm10, %xmm11
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm12
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm12, %ymm11
-; AVX1-NEXT:    vandps %ymm7, %ymm11, %ymm11
-; AVX1-NEXT:    vpandn %xmm9, %xmm6, %xmm9
-; AVX1-NEXT:    vpmaddubsw %xmm9, %xmm0, %xmm9
+; AVX1-NEXT:    vpor %xmm6, %xmm5, %xmm14
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm10
+; AVX1-NEXT:    vpsrlw $4, %xmm10, %xmm5
+; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm11
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm11, %xmm7
+; AVX1-NEXT:    vpmullw %xmm7, %xmm14, %xmm8
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm5, %xmm8, %xmm8
+; AVX1-NEXT:    vpandn %xmm7, %xmm5, %xmm7
+; AVX1-NEXT:    vpmaddubsw %xmm7, %xmm14, %xmm7
+; AVX1-NEXT:    vpsllw $8, %xmm7, %xmm7
+; AVX1-NEXT:    vpor %xmm7, %xmm8, %xmm8
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm11, %xmm9
+; AVX1-NEXT:    vpmullw %xmm9, %xmm14, %xmm12
+; AVX1-NEXT:    vpand %xmm5, %xmm12, %xmm12
+; AVX1-NEXT:    vpandn %xmm9, %xmm5, %xmm9
+; AVX1-NEXT:    vpmaddubsw %xmm9, %xmm14, %xmm9
 ; AVX1-NEXT:    vpsllw $8, %xmm9, %xmm9
-; AVX1-NEXT:    vpandn %xmm10, %xmm6, %xmm10
-; AVX1-NEXT:    vpmaddubsw %xmm10, %xmm1, %xmm10
-; AVX1-NEXT:    vpsllw $8, %xmm10, %xmm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vorps %ymm9, %ymm11, %ymm9
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm10
-; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm11
-; AVX1-NEXT:    vpmullw %xmm1, %xmm11, %xmm12
-; AVX1-NEXT:    vpmullw %xmm0, %xmm10, %xmm13
-; AVX1-NEXT:    vinsertf128 $1, %xmm12, %ymm13, %ymm12
-; AVX1-NEXT:    vandps %ymm7, %ymm12, %ymm12
-; AVX1-NEXT:    vpandn %xmm10, %xmm6, %xmm10
-; AVX1-NEXT:    vpmaddubsw %xmm10, %xmm0, %xmm10
-; AVX1-NEXT:    vpsllw $8, %xmm10, %xmm10
-; AVX1-NEXT:    vpandn %xmm11, %xmm6, %xmm11
-; AVX1-NEXT:    vpmaddubsw %xmm11, %xmm1, %xmm11
-; AVX1-NEXT:    vpsllw $8, %xmm11, %xmm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm10, %ymm10
-; AVX1-NEXT:    vorps %ymm10, %ymm12, %ymm10
-; AVX1-NEXT:    vxorps %ymm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm1, %xmm10, %xmm11
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm12
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm12, %ymm11
-; AVX1-NEXT:    vandps %ymm7, %ymm11, %ymm11
-; AVX1-NEXT:    vpandn %xmm9, %xmm6, %xmm9
-; AVX1-NEXT:    vpmaddubsw %xmm9, %xmm0, %xmm9
+; AVX1-NEXT:    vpor %xmm9, %xmm12, %xmm9
+; AVX1-NEXT:    vpxor %xmm8, %xmm9, %xmm12
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm8 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX1-NEXT:    vpand %xmm8, %xmm11, %xmm9
+; AVX1-NEXT:    vpmullw %xmm9, %xmm14, %xmm13
+; AVX1-NEXT:    vpand %xmm5, %xmm13, %xmm13
+; AVX1-NEXT:    vpandn %xmm9, %xmm5, %xmm9
+; AVX1-NEXT:    vpmaddubsw %xmm9, %xmm14, %xmm9
 ; AVX1-NEXT:    vpsllw $8, %xmm9, %xmm9
-; AVX1-NEXT:    vpandn %xmm10, %xmm6, %xmm10
-; AVX1-NEXT:    vpmaddubsw %xmm10, %xmm1, %xmm10
-; AVX1-NEXT:    vpsllw $8, %xmm10, %xmm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vorps %ymm9, %ymm11, %ymm9
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm10
-; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm11
-; AVX1-NEXT:    vpmullw %xmm1, %xmm11, %xmm12
-; AVX1-NEXT:    vpmullw %xmm0, %xmm10, %xmm13
-; AVX1-NEXT:    vinsertf128 $1, %xmm12, %ymm13, %ymm12
-; AVX1-NEXT:    vandps %ymm7, %ymm12, %ymm12
-; AVX1-NEXT:    vpandn %xmm10, %xmm6, %xmm10
-; AVX1-NEXT:    vpmaddubsw %xmm10, %xmm0, %xmm10
-; AVX1-NEXT:    vpsllw $8, %xmm10, %xmm10
-; AVX1-NEXT:    vpandn %xmm11, %xmm6, %xmm11
-; AVX1-NEXT:    vpmaddubsw %xmm11, %xmm1, %xmm11
+; AVX1-NEXT:    vpor %xmm9, %xmm13, %xmm13
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm11, %xmm11
+; AVX1-NEXT:    vpmullw %xmm11, %xmm14, %xmm15
+; AVX1-NEXT:    vpand %xmm5, %xmm15, %xmm15
+; AVX1-NEXT:    vpandn %xmm11, %xmm5, %xmm11
+; AVX1-NEXT:    vpmaddubsw %xmm11, %xmm14, %xmm11
 ; AVX1-NEXT:    vpsllw $8, %xmm11, %xmm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm10, %ymm10
-; AVX1-NEXT:    vorps %ymm10, %ymm12, %ymm10
-; AVX1-NEXT:    vxorps %ymm10, %ymm9, %ymm9
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm10
-; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm11
-; AVX1-NEXT:    vpmullw %xmm1, %xmm11, %xmm12
-; AVX1-NEXT:    vpmullw %xmm0, %xmm10, %xmm13
-; AVX1-NEXT:    vinsertf128 $1, %xmm12, %ymm13, %ymm12
-; AVX1-NEXT:    vandps %ymm7, %ymm12, %ymm7
-; AVX1-NEXT:    vpandn %xmm10, %xmm6, %xmm10
-; AVX1-NEXT:    vpmaddubsw %xmm10, %xmm0, %xmm10
-; AVX1-NEXT:    vpsllw $8, %xmm10, %xmm10
-; AVX1-NEXT:    vpandn %xmm11, %xmm6, %xmm11
-; AVX1-NEXT:    vpmaddubsw %xmm11, %xmm1, %xmm11
+; AVX1-NEXT:    vpor %xmm11, %xmm15, %xmm11
+; AVX1-NEXT:    vpxor %xmm11, %xmm13, %xmm11
+; AVX1-NEXT:    vpxor %xmm11, %xmm12, %xmm13
+; AVX1-NEXT:    vpand %xmm2, %xmm10, %xmm10
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm9
+; AVX1-NEXT:    vpshufb %xmm10, %xmm0, %xmm15
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm10 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX1-NEXT:    vpand %xmm10, %xmm15, %xmm11
+; AVX1-NEXT:    vpmullw %xmm11, %xmm14, %xmm12
+; AVX1-NEXT:    vpand %xmm5, %xmm12, %xmm12
+; AVX1-NEXT:    vpandn %xmm11, %xmm5, %xmm11
+; AVX1-NEXT:    vpmaddubsw %xmm11, %xmm14, %xmm11
 ; AVX1-NEXT:    vpsllw $8, %xmm11, %xmm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm10, %ymm10
-; AVX1-NEXT:    vorps %ymm7, %ymm10, %ymm7
-; AVX1-NEXT:    vxorps %ymm7, %ymm9, %ymm7
-; AVX1-NEXT:    vxorps %ymm7, %ymm8, %ymm7
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm8
-; AVX1-NEXT:    vpmullw %xmm1, %xmm8, %xmm9
-; AVX1-NEXT:    vpmullw %xmm5, %xmm0, %xmm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm10, %ymm9
-; AVX1-NEXT:    vandps %ymm6, %ymm9, %ymm9
-; AVX1-NEXT:    vpandn %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpmaddubsw %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm11, %xmm12, %xmm12
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT:    vpand %xmm11, %xmm15, %xmm0
+; AVX1-NEXT:    vpmullw %xmm0, %xmm14, %xmm6
+; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpandn %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpmaddubsw %xmm0, %xmm14, %xmm0
 ; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm0
-; AVX1-NEXT:    vpandn %xmm8, %xmm6, %xmm5
-; AVX1-NEXT:    vpmaddubsw %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vorps %ymm0, %ymm9, %ymm0
-; AVX1-NEXT:    vxorps %ymm0, %ymm7, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm5
-; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT:    vpor %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm12, %xmm0
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm12 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX1-NEXT:    vpand %xmm12, %xmm15, %xmm6
+; AVX1-NEXT:    vpmullw %xmm6, %xmm14, %xmm7
+; AVX1-NEXT:    vpand %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpandn %xmm6, %xmm5, %xmm6
+; AVX1-NEXT:    vpmaddubsw %xmm6, %xmm14, %xmm6
+; AVX1-NEXT:    vpsllw $8, %xmm6, %xmm6
+; AVX1-NEXT:    vpor %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm13, %xmm0
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT:    vpand %xmm13, %xmm15, %xmm6
+; AVX1-NEXT:    vpmullw %xmm6, %xmm14, %xmm7
+; AVX1-NEXT:    vpandn %xmm6, %xmm5, %xmm6
+; AVX1-NEXT:    vpmaddubsw %xmm6, %xmm14, %xmm6
+; AVX1-NEXT:    vpand %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpsllw $8, %xmm6, %xmm6
+; AVX1-NEXT:    vpor %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm6
+; AVX1-NEXT:    vpshufb %xmm6, %xmm9, %xmm6
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm6, %xmm14
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm9, %xmm0
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
-; AVX1-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm5
-; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $4, %xmm3, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm6
+; AVX1-NEXT:    vpmullw %xmm6, %xmm0, %xmm7
+; AVX1-NEXT:    vpand %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpandn %xmm6, %xmm5, %xmm6
+; AVX1-NEXT:    vpmaddubsw %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpsllw $8, %xmm6, %xmm6
+; AVX1-NEXT:    vpor %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm7
+; AVX1-NEXT:    vpmullw %xmm7, %xmm0, %xmm15
+; AVX1-NEXT:    vpand %xmm5, %xmm15, %xmm15
+; AVX1-NEXT:    vpandn %xmm7, %xmm5, %xmm7
+; AVX1-NEXT:    vpmaddubsw %xmm7, %xmm0, %xmm7
+; AVX1-NEXT:    vpsllw $8, %xmm7, %xmm7
+; AVX1-NEXT:    vpor %xmm7, %xmm15, %xmm7
+; AVX1-NEXT:    vpxor %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpand %xmm1, %xmm8, %xmm7
+; AVX1-NEXT:    vpmullw %xmm7, %xmm0, %xmm8
+; AVX1-NEXT:    vpand %xmm5, %xmm8, %xmm8
+; AVX1-NEXT:    vpandn %xmm7, %xmm5, %xmm7
+; AVX1-NEXT:    vpmaddubsw %xmm7, %xmm0, %xmm7
+; AVX1-NEXT:    vpsllw $8, %xmm7, %xmm7
+; AVX1-NEXT:    vpor %xmm7, %xmm8, %xmm7
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm8
+; AVX1-NEXT:    vpand %xmm5, %xmm8, %xmm8
+; AVX1-NEXT:    vpandn %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpsllw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm1, %xmm8, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm7, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
+; AVX1-NEXT:    vpand %xmm3, %xmm10, %xmm6
+; AVX1-NEXT:    vpmullw %xmm6, %xmm0, %xmm7
+; AVX1-NEXT:    vpand %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpandn %xmm6, %xmm5, %xmm6
+; AVX1-NEXT:    vpmaddubsw %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpsllw $8, %xmm6, %xmm6
+; AVX1-NEXT:    vpor %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpand %xmm3, %xmm11, %xmm7
+; AVX1-NEXT:    vpmullw %xmm7, %xmm0, %xmm8
+; AVX1-NEXT:    vpand %xmm5, %xmm8, %xmm8
+; AVX1-NEXT:    vpandn %xmm7, %xmm5, %xmm7
+; AVX1-NEXT:    vpmaddubsw %xmm7, %xmm0, %xmm7
+; AVX1-NEXT:    vpsllw $8, %xmm7, %xmm7
+; AVX1-NEXT:    vpor %xmm7, %xmm8, %xmm7
+; AVX1-NEXT:    vpxor %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpand %xmm3, %xmm12, %xmm7
+; AVX1-NEXT:    vpmullw %xmm7, %xmm0, %xmm8
+; AVX1-NEXT:    vpand %xmm5, %xmm8, %xmm8
+; AVX1-NEXT:    vpandn %xmm7, %xmm5, %xmm7
+; AVX1-NEXT:    vpmaddubsw %xmm7, %xmm0, %xmm7
+; AVX1-NEXT:    vpsllw $8, %xmm7, %xmm7
+; AVX1-NEXT:    vpor %xmm7, %xmm8, %xmm7
+; AVX1-NEXT:    vpxor %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm13, %xmm3
+; AVX1-NEXT:    vpmullw %xmm3, %xmm0, %xmm6
+; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpandn %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpmaddubsw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: clmulr_v32i8:
@@ -873,64 +880,66 @@ define <32 x i8> @clmulr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX512-LABEL: clmulr_v32i8:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512-NEXT:    vpand %ymm6, %ymm2, %ymm7
+; AVX512-NEXT:    vpmullw %ymm7, %ymm5, %ymm7
+; AVX512-NEXT:    vpxor %ymm4, %ymm7, %ymm4
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512-NEXT:    vpand %ymm7, %ymm2, %ymm8
+; AVX512-NEXT:    vpmullw %ymm5, %ymm8, %ymm8
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512-NEXT:    vpand %ymm2, %ymm9, %ymm10
+; AVX512-NEXT:    vpmullw %ymm5, %ymm10, %ymm10
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 ^ ymm4 ^ ymm8
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT:    vpand %ymm4, %ymm2, %ymm8
+; AVX512-NEXT:    vpmullw %ymm5, %ymm8, %ymm8
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512-NEXT:    vpand %ymm2, %ymm11, %ymm12
+; AVX512-NEXT:    vpmullw %ymm5, %ymm12, %ymm12
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ ymm10 ^ ymm8
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT:    vpand %ymm2, %ymm8, %ymm10
+; AVX512-NEXT:    vpmullw %ymm5, %ymm10, %ymm10
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vpand %ymm2, %ymm13, %ymm2
+; AVX512-NEXT:    vpmullw %ymm2, %ymm5, %ymm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ ymm12 ^ ymm10
+; AVX512-NEXT:    vpsrlw $7, %ymm2, %ymm2
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512-NEXT:    vpmovdb %zmm2, %xmm2
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3
-; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vpmullw %ymm4, %ymm1, %ymm4
+; AVX512-NEXT:    vpand %ymm3, %ymm1, %ymm3
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512-NEXT:    vpmullw %ymm3, %ymm0, %ymm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm4
-; AVX512-NEXT:    vextracti64x4 $1, %zmm4, %ymm5
-; AVX512-NEXT:    vpmullw %ymm5, %ymm1, %ymm5
-; AVX512-NEXT:    vpmullw %ymm4, %ymm0, %ymm4
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-NEXT:    vpxorq %zmm3, %zmm4, %zmm3
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm4
-; AVX512-NEXT:    vextracti64x4 $1, %zmm4, %ymm5
-; AVX512-NEXT:    vpmullw %ymm5, %ymm1, %ymm5
-; AVX512-NEXT:    vpmullw %ymm4, %ymm0, %ymm4
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm5
-; AVX512-NEXT:    vextracti64x4 $1, %zmm5, %ymm6
-; AVX512-NEXT:    vpmullw %ymm6, %ymm1, %ymm6
+; AVX512-NEXT:    vpand %ymm6, %ymm1, %ymm5
+; AVX512-NEXT:    vpmullw %ymm5, %ymm0, %ymm5
+; AVX512-NEXT:    vpxor %ymm3, %ymm5, %ymm3
+; AVX512-NEXT:    vpand %ymm7, %ymm1, %ymm5
 ; AVX512-NEXT:    vpmullw %ymm5, %ymm0, %ymm5
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm5
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ zmm3 ^ zmm4
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3
-; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512-NEXT:    vpmullw %ymm4, %ymm1, %ymm4
+; AVX512-NEXT:    vpand %ymm1, %ymm9, %ymm6
+; AVX512-NEXT:    vpmullw %ymm6, %ymm0, %ymm6
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 ^ ymm3 ^ ymm5
+; AVX512-NEXT:    vpand %ymm4, %ymm1, %ymm3
 ; AVX512-NEXT:    vpmullw %ymm3, %ymm0, %ymm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm4
-; AVX512-NEXT:    vextracti64x4 $1, %zmm4, %ymm6
-; AVX512-NEXT:    vpmullw %ymm6, %ymm1, %ymm6
+; AVX512-NEXT:    vpand %ymm1, %ymm11, %ymm4
 ; AVX512-NEXT:    vpmullw %ymm4, %ymm0, %ymm4
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm4
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ zmm5 ^ zmm3
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3
-; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm5
-; AVX512-NEXT:    vpmullw %ymm5, %ymm1, %ymm5
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm4 ^ ymm6 ^ ymm3
+; AVX512-NEXT:    vpand %ymm1, %ymm8, %ymm3
 ; AVX512-NEXT:    vpmullw %ymm3, %ymm0, %ymm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm3
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512-NEXT:    vextracti64x4 $1, %zmm2, %ymm5
-; AVX512-NEXT:    vpmullw %ymm5, %ymm1, %ymm1
-; AVX512-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ zmm4 ^ zmm3
-; AVX512-NEXT:    vpsrlw $7, %ymm0, %ymm1
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vpand %ymm1, %ymm13, %ymm1
+; AVX512-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ ymm4 ^ ymm3
 ; AVX512-NEXT:    vpsrlw $7, %ymm0, %ymm0
 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
 ; AVX512-NEXT:    retq
   %a.ext = zext <32 x i8> %a to <32 x i16>
   %b.ext = zext <32 x i8> %b to <32 x i16>
@@ -943,230 +952,232 @@ define <32 x i8> @clmulr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <16 x i16> @clmulr_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX1-LABEL: clmulr_v16i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm5
+; AVX1-NEXT:    vmovdqa %ymm1, %ymm5
+; AVX1-NEXT:    vmovdqa %ymm0, %ymm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm12 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX1-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
 ; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm6
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm6
-; AVX1-NEXT:    vpsrlw $4, %xmm5, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm7
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
-; AVX1-NEXT:    vpshufb %xmm7, %xmm5, %xmm7
-; AVX1-NEXT:    vpor %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm7
-; AVX1-NEXT:    vpshufb %xmm7, %xmm4, %xmm7
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
-; AVX1-NEXT:    vpor %xmm1, %xmm7, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm7
-; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
-; AVX1-NEXT:    vpand %xmm2, %xmm6, %xmm9
-; AVX1-NEXT:    vpshufb %xmm9, %xmm4, %xmm9
-; AVX1-NEXT:    vpsrlw $4, %xmm6, %xmm6
-; AVX1-NEXT:    vpand %xmm2, %xmm6, %xmm6
-; AVX1-NEXT:    vpshufb %xmm6, %xmm5, %xmm6
-; AVX1-NEXT:    vpor %xmm6, %xmm9, %xmm6
-; AVX1-NEXT:    vpmullw %xmm6, %xmm8, %xmm8
-; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm9
-; AVX1-NEXT:    vpshufb %xmm9, %xmm4, %xmm9
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vpor %xmm0, %xmm9, %xmm0
-; AVX1-NEXT:    vpmullw %xmm7, %xmm0, %xmm7
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm7, %ymm7
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm8
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm9
-; AVX1-NEXT:    vpmullw %xmm6, %xmm9, %xmm9
-; AVX1-NEXT:    vpmullw %xmm0, %xmm8, %xmm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm8, %ymm8
-; AVX1-NEXT:    vxorps %ymm7, %ymm8, %ymm7
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm8
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm9
-; AVX1-NEXT:    vpmullw %xmm6, %xmm9, %xmm9
-; AVX1-NEXT:    vpmullw %xmm0, %xmm8, %xmm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm6, %xmm10, %xmm10
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vxorps %ymm7, %ymm8, %ymm7
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm8
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm9
-; AVX1-NEXT:    vpmullw %xmm6, %xmm9, %xmm9
-; AVX1-NEXT:    vpmullw %xmm0, %xmm8, %xmm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm6, %xmm10, %xmm10
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm6, %xmm10, %xmm10
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vxorps %ymm7, %ymm8, %ymm7
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm8
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm9
-; AVX1-NEXT:    vpmullw %xmm6, %xmm9, %xmm9
-; AVX1-NEXT:    vpmullw %xmm0, %xmm8, %xmm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm6, %xmm10, %xmm10
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm6, %xmm10, %xmm10
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm6, %xmm10, %xmm10
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vxorps %ymm7, %ymm8, %ymm7
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm8
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm9
-; AVX1-NEXT:    vpmullw %xmm6, %xmm9, %xmm9
-; AVX1-NEXT:    vpmullw %xmm0, %xmm8, %xmm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm6, %xmm10, %xmm10
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm6, %xmm10, %xmm10
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm6, %xmm10, %xmm10
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm9
-; AVX1-NEXT:    vpmullw %xmm6, %xmm9, %xmm6
-; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX1-NEXT:    vxorps %ymm0, %ymm8, %ymm0
-; AVX1-NEXT:    vxorps %ymm0, %ymm7, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm6
-; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm6
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm7
+; AVX1-NEXT:    vpand %xmm2, %xmm7, %xmm7
+; AVX1-NEXT:    vpshufb %xmm7, %xmm6, %xmm7
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm14 = [2,2,2,2,2,2,2,2]
+; AVX1-NEXT:    vpand %xmm7, %xmm14, %xmm9
+; AVX1-NEXT:    vpmullw %xmm3, %xmm9, %xmm9
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm10
+; AVX1-NEXT:    vpmullw %xmm3, %xmm10, %xmm10
+; AVX1-NEXT:    vpxor %xmm9, %xmm10, %xmm9
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm10
+; AVX1-NEXT:    vpmullw %xmm3, %xmm10, %xmm10
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm11
+; AVX1-NEXT:    vpmullw %xmm3, %xmm11, %xmm11
+; AVX1-NEXT:    vpxor %xmm11, %xmm10, %xmm10
+; AVX1-NEXT:    vpxor %xmm10, %xmm9, %xmm9
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm8, %xmm0
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm10
+; AVX1-NEXT:    vpmullw %xmm3, %xmm10, %xmm10
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm11
+; AVX1-NEXT:    vpmullw %xmm3, %xmm11, %xmm11
+; AVX1-NEXT:    vpxor %xmm11, %xmm10, %xmm10
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm11
+; AVX1-NEXT:    vpmullw %xmm3, %xmm11, %xmm11
+; AVX1-NEXT:    vpxor %xmm11, %xmm10, %xmm10
+; AVX1-NEXT:    vpxor %xmm10, %xmm9, %xmm9
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm10
+; AVX1-NEXT:    vpmullw %xmm3, %xmm10, %xmm10
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm11
+; AVX1-NEXT:    vpmullw %xmm3, %xmm11, %xmm11
+; AVX1-NEXT:    vpxor %xmm11, %xmm10, %xmm10
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm11
+; AVX1-NEXT:    vpmullw %xmm3, %xmm11, %xmm11
+; AVX1-NEXT:    vpxor %xmm11, %xmm10, %xmm10
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm11
+; AVX1-NEXT:    vpmullw %xmm3, %xmm11, %xmm11
+; AVX1-NEXT:    vpxor %xmm11, %xmm10, %xmm10
+; AVX1-NEXT:    vpxor %xmm10, %xmm9, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm13 = [2048,2048,2048,2048,2048,2048,2048,2048]
+; AVX1-NEXT:    vpand %xmm7, %xmm13, %xmm7
+; AVX1-NEXT:    vpmullw %xmm7, %xmm3, %xmm7
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm11 = [4096,4096,4096,4096,4096,4096,4096,4096]
+; AVX1-NEXT:    vpand %xmm0, %xmm11, %xmm9
+; AVX1-NEXT:    vpmullw %xmm3, %xmm9, %xmm9
+; AVX1-NEXT:    vpxor %xmm7, %xmm9, %xmm7
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm10 = [8192,8192,8192,8192,8192,8192,8192,8192]
+; AVX1-NEXT:    vpand %xmm0, %xmm10, %xmm9
+; AVX1-NEXT:    vpmullw %xmm3, %xmm9, %xmm9
+; AVX1-NEXT:    vpxor %xmm7, %xmm9, %xmm7
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm9 = [16384,16384,16384,16384,16384,16384,16384,16384]
+; AVX1-NEXT:    vpand %xmm0, %xmm9, %xmm15
+; AVX1-NEXT:    vpmullw %xmm3, %xmm15, %xmm15
+; AVX1-NEXT:    vpxor %xmm7, %xmm15, %xmm15
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm7 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm15, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm3
+; AVX1-NEXT:    vpshufb %xmm12, %xmm4, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpshufb %xmm12, %xmm5, %xmm4
+; AVX1-NEXT:    vpsrlw $4, %xmm4, %xmm1
 ; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
-; AVX1-NEXT:    vpor %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm14, %xmm5
+; AVX1-NEXT:    vpmullw %xmm5, %xmm0, %xmm5
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm15
+; AVX1-NEXT:    vpmullw %xmm0, %xmm15, %xmm15
+; AVX1-NEXT:    vpxor %xmm5, %xmm15, %xmm5
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm15
+; AVX1-NEXT:    vpmullw %xmm0, %xmm15, %xmm15
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm14
+; AVX1-NEXT:    vpmullw %xmm0, %xmm14, %xmm14
+; AVX1-NEXT:    vpxor %xmm14, %xmm15, %xmm14
+; AVX1-NEXT:    vpxor %xmm5, %xmm14, %xmm5
+; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm4, %xmm8, %xmm4
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm14
+; AVX1-NEXT:    vpmullw %xmm0, %xmm14, %xmm14
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm15
+; AVX1-NEXT:    vpmullw %xmm0, %xmm15, %xmm15
+; AVX1-NEXT:    vpxor %xmm15, %xmm14, %xmm14
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm15
+; AVX1-NEXT:    vpmullw %xmm0, %xmm15, %xmm15
+; AVX1-NEXT:    vpxor %xmm15, %xmm14, %xmm14
+; AVX1-NEXT:    vpxor %xmm5, %xmm14, %xmm5
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm14
+; AVX1-NEXT:    vpmullw %xmm0, %xmm14, %xmm14
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm15
+; AVX1-NEXT:    vpmullw %xmm0, %xmm15, %xmm15
+; AVX1-NEXT:    vpxor %xmm15, %xmm14, %xmm14
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm15
+; AVX1-NEXT:    vpmullw %xmm0, %xmm15, %xmm15
+; AVX1-NEXT:    vpxor %xmm15, %xmm14, %xmm14
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm15
+; AVX1-NEXT:    vpmullw %xmm0, %xmm15, %xmm15
+; AVX1-NEXT:    vpxor %xmm15, %xmm14, %xmm14
+; AVX1-NEXT:    vpxor %xmm5, %xmm14, %xmm5
+; AVX1-NEXT:    vpand %xmm1, %xmm13, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm11, %xmm11
+; AVX1-NEXT:    vpmullw %xmm0, %xmm11, %xmm11
+; AVX1-NEXT:    vpxor %xmm1, %xmm11, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm10, %xmm10
+; AVX1-NEXT:    vpmullw %xmm0, %xmm10, %xmm10
+; AVX1-NEXT:    vpxor %xmm1, %xmm10, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm9, %xmm9
+; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
+; AVX1-NEXT:    vpxor %xmm1, %xmm9, %xmm1
+; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpmullw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: clmulr_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
-; AVX2-NEXT:    vpshufb %ymm5, %ymm0, %ymm3
-; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm0, %ymm3, %ymm4
-; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
 ; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX2-NEXT:    vpshufb %ymm4, %ymm2, %ymm4
-; AVX2-NEXT:    vpsrlw $4, %ymm3, %ymm3
-; AVX2-NEXT:    vpand %ymm0, %ymm3, %ymm6
-; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm4
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm0, %ymm4, %ymm5
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
 ; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX2-NEXT:    vpshufb %ymm6, %ymm3, %ymm6
-; AVX2-NEXT:    vpor %ymm6, %ymm4, %ymm4
-; AVX2-NEXT:    vpshufb %ymm5, %ymm1, %ymm5
-; AVX2-NEXT:    vpsrlw $4, %ymm5, %ymm1
+; AVX2-NEXT:    vpshufb %ymm5, %ymm3, %ymm5
+; AVX2-NEXT:    vpsrlw $4, %ymm4, %ymm4
+; AVX2-NEXT:    vpand %ymm0, %ymm4, %ymm6
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX2-NEXT:    vpshufb %ymm6, %ymm4, %ymm6
+; AVX2-NEXT:    vpor %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm6
+; AVX2-NEXT:    vpsrlw $4, %ymm6, %ymm1
 ; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm6
-; AVX2-NEXT:    vpmullw %ymm6, %ymm4, %ymm6
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm7
-; AVX2-NEXT:    vpmullw %ymm7, %ymm4, %ymm7
-; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm6
+; AVX2-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm7
-; AVX2-NEXT:    vpmullw %ymm7, %ymm4, %ymm7
+; AVX2-NEXT:    vpmullw %ymm7, %ymm5, %ymm7
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm8
-; AVX2-NEXT:    vpmullw %ymm4, %ymm8, %ymm8
+; AVX2-NEXT:    vpmullw %ymm5, %ymm8, %ymm8
 ; AVX2-NEXT:    vpxor %ymm7, %ymm8, %ymm7
-; AVX2-NEXT:    vpxor %ymm7, %ymm6, %ymm6
-; AVX2-NEXT:    vpand %ymm0, %ymm5, %ymm5
-; AVX2-NEXT:    vpshufb %ymm5, %ymm2, %ymm5
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm7
-; AVX2-NEXT:    vpmullw %ymm7, %ymm4, %ymm7
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8
-; AVX2-NEXT:    vpmullw %ymm4, %ymm8, %ymm8
-; AVX2-NEXT:    vpxor %ymm7, %ymm8, %ymm7
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8
-; AVX2-NEXT:    vpmullw %ymm4, %ymm8, %ymm8
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm8
+; AVX2-NEXT:    vpmullw %ymm5, %ymm8, %ymm8
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
+; AVX2-NEXT:    vpmullw %ymm5, %ymm9, %ymm9
+; AVX2-NEXT:    vpxor %ymm9, %ymm8, %ymm8
 ; AVX2-NEXT:    vpxor %ymm7, %ymm8, %ymm7
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8
-; AVX2-NEXT:    vpmullw %ymm4, %ymm8, %ymm8
+; AVX2-NEXT:    vpand %ymm0, %ymm6, %ymm6
+; AVX2-NEXT:    vpshufb %ymm6, %ymm3, %ymm6
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm8
+; AVX2-NEXT:    vpmullw %ymm5, %ymm8, %ymm8
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm9
+; AVX2-NEXT:    vpmullw %ymm5, %ymm9, %ymm9
+; AVX2-NEXT:    vpxor %ymm9, %ymm8, %ymm8
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm9
+; AVX2-NEXT:    vpmullw %ymm5, %ymm9, %ymm9
+; AVX2-NEXT:    vpxor %ymm9, %ymm8, %ymm8
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm9
+; AVX2-NEXT:    vpmullw %ymm5, %ymm9, %ymm9
+; AVX2-NEXT:    vpxor %ymm9, %ymm8, %ymm8
 ; AVX2-NEXT:    vpxor %ymm7, %ymm8, %ymm7
-; AVX2-NEXT:    vpxor %ymm7, %ymm6, %ymm6
-; AVX2-NEXT:    vpsllw $8, %ymm6, %ymm7
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm8
-; AVX2-NEXT:    vpmullw %ymm4, %ymm8, %ymm8
+; AVX2-NEXT:    vpmullw %ymm5, %ymm8, %ymm8
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX2-NEXT:    vpmullw %ymm4, %ymm9, %ymm9
+; AVX2-NEXT:    vpmullw %ymm5, %ymm9, %ymm9
 ; AVX2-NEXT:    vpxor %ymm9, %ymm8, %ymm8
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX2-NEXT:    vpmullw %ymm4, %ymm9, %ymm9
+; AVX2-NEXT:    vpmullw %ymm5, %ymm9, %ymm9
 ; AVX2-NEXT:    vpxor %ymm9, %ymm8, %ymm8
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpmullw %ymm1, %ymm5, %ymm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm8, %ymm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm6, %ymm1
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm6
-; AVX2-NEXT:    vpmullw %ymm6, %ymm4, %ymm6
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8
-; AVX2-NEXT:    vpmullw %ymm4, %ymm8, %ymm8
-; AVX2-NEXT:    vpxor %ymm6, %ymm8, %ymm6
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8
-; AVX2-NEXT:    vpmullw %ymm4, %ymm8, %ymm8
-; AVX2-NEXT:    vpxor %ymm6, %ymm8, %ymm6
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5
-; AVX2-NEXT:    vpmullw %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpxor %ymm4, %ymm6, %ymm4
-; AVX2-NEXT:    vpxor %ymm4, %ymm1, %ymm1
-; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT:    vpor %ymm1, %ymm7, %ymm1
-; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm4
-; AVX2-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpxor %ymm1, %ymm7, %ymm1
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm7
+; AVX2-NEXT:    vpmullw %ymm7, %ymm5, %ymm7
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm8
+; AVX2-NEXT:    vpmullw %ymm5, %ymm8, %ymm8
+; AVX2-NEXT:    vpxor %ymm7, %ymm8, %ymm7
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm8
+; AVX2-NEXT:    vpmullw %ymm5, %ymm8, %ymm8
+; AVX2-NEXT:    vpxor %ymm7, %ymm8, %ymm7
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
+; AVX2-NEXT:    vpmullw %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpxor %ymm5, %ymm7, %ymm5
+; AVX2-NEXT:    vpxor %ymm5, %ymm1, %ymm1
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -1586,172 +1597,181 @@ define <4 x i64> @clmulr_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 define <32 x i8> @clmulh_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX1-LABEL: clmulh_v32i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vmovdqa %ymm1, %ymm3
+; AVX1-NEXT:    vmovdqa %ymm0, %ymm1
 ; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
-; AVX1-NEXT:    vpsrlw $4, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm4
 ; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm6
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
 ; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm6
-; AVX1-NEXT:    vpor %xmm6, %xmm5, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm6
-; AVX1-NEXT:    vpshufb %xmm6, %xmm3, %xmm6
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
-; AVX1-NEXT:    vpor %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm5
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm9
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm6
-; AVX1-NEXT:    vpshufb %xmm6, %xmm3, %xmm6
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
-; AVX1-NEXT:    vpor %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vpmullw %xmm1, %xmm9, %xmm6
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm7
-; AVX1-NEXT:    vpshufb %xmm7, %xmm3, %xmm7
-; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpor %xmm0, %xmm7, %xmm0
-; AVX1-NEXT:    vpmullw %xmm0, %xmm8, %xmm7
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
-; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm7 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX1-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX1-NEXT:    vandps %ymm7, %ymm6, %ymm10
-; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm6 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX1-NEXT:    vpandn %xmm8, %xmm6, %xmm8
-; AVX1-NEXT:    vpmaddubsw %xmm8, %xmm0, %xmm8
-; AVX1-NEXT:    vpsllw $8, %xmm8, %xmm8
-; AVX1-NEXT:    vpandn %xmm9, %xmm6, %xmm9
-; AVX1-NEXT:    vpmaddubsw %xmm9, %xmm1, %xmm9
-; AVX1-NEXT:    vpsllw $8, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm8, %ymm8
-; AVX1-NEXT:    vorps %ymm8, %ymm10, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm1, %xmm10, %xmm11
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm12
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm12, %ymm11
-; AVX1-NEXT:    vandps %ymm7, %ymm11, %ymm11
-; AVX1-NEXT:    vpandn %xmm9, %xmm6, %xmm9
-; AVX1-NEXT:    vpmaddubsw %xmm9, %xmm0, %xmm9
-; AVX1-NEXT:    vpsllw $8, %xmm9, %xmm9
-; AVX1-NEXT:    vpandn %xmm10, %xmm6, %xmm10
-; AVX1-NEXT:    vpmaddubsw %xmm10, %xmm1, %xmm10
-; AVX1-NEXT:    vpsllw $8, %xmm10, %xmm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vorps %ymm9, %ymm11, %ymm9
-; AVX1-NEXT:    vxorps %ymm8, %ymm9, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm1, %xmm10, %xmm11
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm12
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm12, %ymm11
-; AVX1-NEXT:    vandps %ymm7, %ymm11, %ymm11
-; AVX1-NEXT:    vpandn %xmm9, %xmm6, %xmm9
-; AVX1-NEXT:    vpmaddubsw %xmm9, %xmm0, %xmm9
+; AVX1-NEXT:    vpor %xmm6, %xmm5, %xmm14
+; AVX1-NEXT:    vpsrlw $4, %xmm3, %xmm5
+; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm10
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm10, %xmm7
+; AVX1-NEXT:    vpmullw %xmm7, %xmm14, %xmm8
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm5, %xmm8, %xmm8
+; AVX1-NEXT:    vpandn %xmm7, %xmm5, %xmm7
+; AVX1-NEXT:    vpmaddubsw %xmm7, %xmm14, %xmm7
+; AVX1-NEXT:    vpsllw $8, %xmm7, %xmm7
+; AVX1-NEXT:    vpor %xmm7, %xmm8, %xmm8
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm10, %xmm9
+; AVX1-NEXT:    vpmullw %xmm9, %xmm14, %xmm11
+; AVX1-NEXT:    vpand %xmm5, %xmm11, %xmm11
+; AVX1-NEXT:    vpandn %xmm9, %xmm5, %xmm9
+; AVX1-NEXT:    vpmaddubsw %xmm9, %xmm14, %xmm9
 ; AVX1-NEXT:    vpsllw $8, %xmm9, %xmm9
-; AVX1-NEXT:    vpandn %xmm10, %xmm6, %xmm10
-; AVX1-NEXT:    vpmaddubsw %xmm10, %xmm1, %xmm10
-; AVX1-NEXT:    vpsllw $8, %xmm10, %xmm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vorps %ymm9, %ymm11, %ymm9
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm10
-; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm11
-; AVX1-NEXT:    vpmullw %xmm1, %xmm11, %xmm12
-; AVX1-NEXT:    vpmullw %xmm0, %xmm10, %xmm13
-; AVX1-NEXT:    vinsertf128 $1, %xmm12, %ymm13, %ymm12
-; AVX1-NEXT:    vandps %ymm7, %ymm12, %ymm12
-; AVX1-NEXT:    vpandn %xmm10, %xmm6, %xmm10
-; AVX1-NEXT:    vpmaddubsw %xmm10, %xmm0, %xmm10
-; AVX1-NEXT:    vpsllw $8, %xmm10, %xmm10
-; AVX1-NEXT:    vpandn %xmm11, %xmm6, %xmm11
-; AVX1-NEXT:    vpmaddubsw %xmm11, %xmm1, %xmm11
-; AVX1-NEXT:    vpsllw $8, %xmm11, %xmm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm10, %ymm10
-; AVX1-NEXT:    vorps %ymm10, %ymm12, %ymm10
-; AVX1-NEXT:    vxorps %ymm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm1, %xmm10, %xmm11
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm12
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm12, %ymm11
-; AVX1-NEXT:    vandps %ymm7, %ymm11, %ymm11
-; AVX1-NEXT:    vpandn %xmm9, %xmm6, %xmm9
-; AVX1-NEXT:    vpmaddubsw %xmm9, %xmm0, %xmm9
+; AVX1-NEXT:    vpor %xmm9, %xmm11, %xmm9
+; AVX1-NEXT:    vpxor %xmm8, %xmm9, %xmm11
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm8 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX1-NEXT:    vpand %xmm8, %xmm10, %xmm9
+; AVX1-NEXT:    vpmullw %xmm9, %xmm14, %xmm12
+; AVX1-NEXT:    vpand %xmm5, %xmm12, %xmm12
+; AVX1-NEXT:    vpandn %xmm9, %xmm5, %xmm9
+; AVX1-NEXT:    vpmaddubsw %xmm9, %xmm14, %xmm9
 ; AVX1-NEXT:    vpsllw $8, %xmm9, %xmm9
-; AVX1-NEXT:    vpandn %xmm10, %xmm6, %xmm10
-; AVX1-NEXT:    vpmaddubsw %xmm10, %xmm1, %xmm10
-; AVX1-NEXT:    vpsllw $8, %xmm10, %xmm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vorps %ymm9, %ymm11, %ymm9
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm10
-; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm11
-; AVX1-NEXT:    vpmullw %xmm1, %xmm11, %xmm12
-; AVX1-NEXT:    vpmullw %xmm0, %xmm10, %xmm13
-; AVX1-NEXT:    vinsertf128 $1, %xmm12, %ymm13, %ymm12
-; AVX1-NEXT:    vandps %ymm7, %ymm12, %ymm12
-; AVX1-NEXT:    vpandn %xmm10, %xmm6, %xmm10
-; AVX1-NEXT:    vpmaddubsw %xmm10, %xmm0, %xmm10
-; AVX1-NEXT:    vpsllw $8, %xmm10, %xmm10
-; AVX1-NEXT:    vpandn %xmm11, %xmm6, %xmm11
-; AVX1-NEXT:    vpmaddubsw %xmm11, %xmm1, %xmm11
-; AVX1-NEXT:    vpsllw $8, %xmm11, %xmm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm10, %ymm10
-; AVX1-NEXT:    vorps %ymm10, %ymm12, %ymm10
-; AVX1-NEXT:    vxorps %ymm10, %ymm9, %ymm9
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm10
-; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm11
-; AVX1-NEXT:    vpmullw %xmm1, %xmm11, %xmm12
-; AVX1-NEXT:    vpmullw %xmm0, %xmm10, %xmm13
-; AVX1-NEXT:    vinsertf128 $1, %xmm12, %ymm13, %ymm12
-; AVX1-NEXT:    vandps %ymm7, %ymm12, %ymm7
-; AVX1-NEXT:    vpandn %xmm10, %xmm6, %xmm10
-; AVX1-NEXT:    vpmaddubsw %xmm10, %xmm0, %xmm10
+; AVX1-NEXT:    vpor %xmm9, %xmm12, %xmm12
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm10, %xmm10
+; AVX1-NEXT:    vpmullw %xmm10, %xmm14, %xmm13
+; AVX1-NEXT:    vpand %xmm5, %xmm13, %xmm13
+; AVX1-NEXT:    vpandn %xmm10, %xmm5, %xmm10
+; AVX1-NEXT:    vpmaddubsw %xmm10, %xmm14, %xmm10
 ; AVX1-NEXT:    vpsllw $8, %xmm10, %xmm10
-; AVX1-NEXT:    vpandn %xmm11, %xmm6, %xmm11
-; AVX1-NEXT:    vpmaddubsw %xmm11, %xmm1, %xmm11
+; AVX1-NEXT:    vpor %xmm10, %xmm13, %xmm10
+; AVX1-NEXT:    vpxor %xmm10, %xmm12, %xmm10
+; AVX1-NEXT:    vpxor %xmm10, %xmm11, %xmm13
+; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm10
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm9
+; AVX1-NEXT:    vpshufb %xmm10, %xmm0, %xmm15
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm10 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX1-NEXT:    vpand %xmm10, %xmm15, %xmm11
+; AVX1-NEXT:    vpmullw %xmm11, %xmm14, %xmm12
+; AVX1-NEXT:    vpand %xmm5, %xmm12, %xmm12
+; AVX1-NEXT:    vpandn %xmm11, %xmm5, %xmm11
+; AVX1-NEXT:    vpmaddubsw %xmm11, %xmm14, %xmm11
 ; AVX1-NEXT:    vpsllw $8, %xmm11, %xmm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm10, %ymm10
-; AVX1-NEXT:    vorps %ymm7, %ymm10, %ymm7
-; AVX1-NEXT:    vxorps %ymm7, %ymm9, %ymm7
-; AVX1-NEXT:    vxorps %ymm7, %ymm8, %ymm7
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm8
-; AVX1-NEXT:    vpmullw %xmm1, %xmm8, %xmm9
-; AVX1-NEXT:    vpmullw %xmm5, %xmm0, %xmm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm10, %ymm9
-; AVX1-NEXT:    vandps %ymm6, %ymm9, %ymm9
-; AVX1-NEXT:    vpandn %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpmaddubsw %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm11, %xmm12, %xmm12
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT:    vpand %xmm11, %xmm15, %xmm0
+; AVX1-NEXT:    vpmullw %xmm0, %xmm14, %xmm6
+; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpandn %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpmaddubsw %xmm0, %xmm14, %xmm0
 ; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm0
-; AVX1-NEXT:    vpandn %xmm8, %xmm6, %xmm5
-; AVX1-NEXT:    vpmaddubsw %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vorps %ymm0, %ymm9, %ymm0
-; AVX1-NEXT:    vxorps %ymm0, %ymm7, %ymm0
+; AVX1-NEXT:    vpor %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm12, %xmm0
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm12 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX1-NEXT:    vpand %xmm12, %xmm15, %xmm6
+; AVX1-NEXT:    vpmullw %xmm6, %xmm14, %xmm7
+; AVX1-NEXT:    vpand %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpandn %xmm6, %xmm5, %xmm6
+; AVX1-NEXT:    vpmaddubsw %xmm6, %xmm14, %xmm6
+; AVX1-NEXT:    vpsllw $8, %xmm6, %xmm6
+; AVX1-NEXT:    vpor %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm13, %xmm0
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT:    vpand %xmm13, %xmm15, %xmm6
+; AVX1-NEXT:    vpmullw %xmm6, %xmm14, %xmm7
+; AVX1-NEXT:    vpandn %xmm6, %xmm5, %xmm6
+; AVX1-NEXT:    vpmaddubsw %xmm6, %xmm14, %xmm6
+; AVX1-NEXT:    vpand %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpsllw $8, %xmm6, %xmm6
+; AVX1-NEXT:    vpor %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm6
+; AVX1-NEXT:    vpshufb %xmm6, %xmm9, %xmm6
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm6, %xmm14
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
-; AVX1-NEXT:    vpor %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm5
-; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm1
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm6
+; AVX1-NEXT:    vpmullw %xmm6, %xmm0, %xmm7
+; AVX1-NEXT:    vpand %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpandn %xmm6, %xmm5, %xmm6
+; AVX1-NEXT:    vpmaddubsw %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpsllw $8, %xmm6, %xmm6
+; AVX1-NEXT:    vpor %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7
+; AVX1-NEXT:    vpmullw %xmm7, %xmm0, %xmm15
+; AVX1-NEXT:    vpand %xmm5, %xmm15, %xmm15
+; AVX1-NEXT:    vpandn %xmm7, %xmm5, %xmm7
+; AVX1-NEXT:    vpmaddubsw %xmm7, %xmm0, %xmm7
+; AVX1-NEXT:    vpsllw $8, %xmm7, %xmm7
+; AVX1-NEXT:    vpor %xmm7, %xmm15, %xmm7
+; AVX1-NEXT:    vpxor %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpand %xmm3, %xmm8, %xmm7
+; AVX1-NEXT:    vpmullw %xmm7, %xmm0, %xmm8
+; AVX1-NEXT:    vpand %xmm5, %xmm8, %xmm8
+; AVX1-NEXT:    vpandn %xmm7, %xmm5, %xmm7
+; AVX1-NEXT:    vpmaddubsw %xmm7, %xmm0, %xmm7
+; AVX1-NEXT:    vpsllw $8, %xmm7, %xmm7
+; AVX1-NEXT:    vpor %xmm7, %xmm8, %xmm7
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpmullw %xmm3, %xmm0, %xmm8
+; AVX1-NEXT:    vpand %xmm5, %xmm8, %xmm8
+; AVX1-NEXT:    vpandn %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpmaddubsw %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpsllw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpor %xmm3, %xmm8, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm7, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm10, %xmm6
+; AVX1-NEXT:    vpmullw %xmm6, %xmm0, %xmm7
+; AVX1-NEXT:    vpand %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpandn %xmm6, %xmm5, %xmm6
+; AVX1-NEXT:    vpmaddubsw %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpsllw $8, %xmm6, %xmm6
+; AVX1-NEXT:    vpor %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpand %xmm1, %xmm11, %xmm7
+; AVX1-NEXT:    vpmullw %xmm7, %xmm0, %xmm8
+; AVX1-NEXT:    vpand %xmm5, %xmm8, %xmm8
+; AVX1-NEXT:    vpandn %xmm7, %xmm5, %xmm7
+; AVX1-NEXT:    vpmaddubsw %xmm7, %xmm0, %xmm7
+; AVX1-NEXT:    vpsllw $8, %xmm7, %xmm7
+; AVX1-NEXT:    vpor %xmm7, %xmm8, %xmm7
+; AVX1-NEXT:    vpxor %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpand %xmm1, %xmm12, %xmm7
+; AVX1-NEXT:    vpmullw %xmm7, %xmm0, %xmm8
+; AVX1-NEXT:    vpand %xmm5, %xmm8, %xmm8
+; AVX1-NEXT:    vpandn %xmm7, %xmm5, %xmm7
+; AVX1-NEXT:    vpmaddubsw %xmm7, %xmm0, %xmm7
+; AVX1-NEXT:    vpsllw $8, %xmm7, %xmm7
+; AVX1-NEXT:    vpor %xmm7, %xmm8, %xmm7
+; AVX1-NEXT:    vpxor %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm1, %xmm13, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm6
+; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpandn %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm14, %xmm1
 ; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1851,65 +1871,64 @@ define <32 x i8> @clmulh_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ;
 ; AVX512-LABEL: clmulh_v32i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3
-; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm5
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
+; AVX512-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512-NEXT:    vpand %ymm6, %ymm2, %ymm7
+; AVX512-NEXT:    vpmullw %ymm7, %ymm5, %ymm7
+; AVX512-NEXT:    vpxor %ymm4, %ymm7, %ymm4
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512-NEXT:    vpand %ymm7, %ymm2, %ymm8
+; AVX512-NEXT:    vpmullw %ymm5, %ymm8, %ymm8
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512-NEXT:    vpand %ymm2, %ymm9, %ymm10
+; AVX512-NEXT:    vpmullw %ymm5, %ymm10, %ymm10
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 ^ ymm4 ^ ymm8
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT:    vpand %ymm4, %ymm2, %ymm8
+; AVX512-NEXT:    vpmullw %ymm5, %ymm8, %ymm8
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512-NEXT:    vpand %ymm2, %ymm11, %ymm12
+; AVX512-NEXT:    vpmullw %ymm5, %ymm12, %ymm12
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ ymm10 ^ ymm8
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT:    vpand %ymm2, %ymm8, %ymm10
+; AVX512-NEXT:    vpmullw %ymm5, %ymm10, %ymm10
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vpand %ymm2, %ymm13, %ymm2
+; AVX512-NEXT:    vpmullw %ymm2, %ymm5, %ymm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ ymm12 ^ ymm10
+; AVX512-NEXT:    vpsrlw $8, %ymm2, %ymm2
 ; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vpmullw %ymm4, %ymm1, %ymm4
+; AVX512-NEXT:    vpand %ymm3, %ymm1, %ymm3
 ; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512-NEXT:    vpmullw %ymm3, %ymm0, %ymm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm4
-; AVX512-NEXT:    vextracti64x4 $1, %zmm4, %ymm5
-; AVX512-NEXT:    vpmullw %ymm5, %ymm1, %ymm5
-; AVX512-NEXT:    vpmullw %ymm4, %ymm0, %ymm4
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-NEXT:    vpxorq %zmm3, %zmm4, %zmm3
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm4
-; AVX512-NEXT:    vextracti64x4 $1, %zmm4, %ymm5
-; AVX512-NEXT:    vpmullw %ymm5, %ymm1, %ymm5
-; AVX512-NEXT:    vpmullw %ymm4, %ymm0, %ymm4
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm5
-; AVX512-NEXT:    vextracti64x4 $1, %zmm5, %ymm6
-; AVX512-NEXT:    vpmullw %ymm6, %ymm1, %ymm6
+; AVX512-NEXT:    vpand %ymm6, %ymm1, %ymm5
 ; AVX512-NEXT:    vpmullw %ymm5, %ymm0, %ymm5
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm5
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ zmm3 ^ zmm4
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3
-; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512-NEXT:    vpmullw %ymm4, %ymm1, %ymm4
+; AVX512-NEXT:    vpxor %ymm3, %ymm5, %ymm3
+; AVX512-NEXT:    vpand %ymm7, %ymm1, %ymm5
+; AVX512-NEXT:    vpmullw %ymm5, %ymm0, %ymm5
+; AVX512-NEXT:    vpand %ymm1, %ymm9, %ymm6
+; AVX512-NEXT:    vpmullw %ymm6, %ymm0, %ymm6
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 ^ ymm3 ^ ymm5
+; AVX512-NEXT:    vpand %ymm4, %ymm1, %ymm3
 ; AVX512-NEXT:    vpmullw %ymm3, %ymm0, %ymm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm4
-; AVX512-NEXT:    vextracti64x4 $1, %zmm4, %ymm6
-; AVX512-NEXT:    vpmullw %ymm6, %ymm1, %ymm6
+; AVX512-NEXT:    vpand %ymm1, %ymm11, %ymm4
 ; AVX512-NEXT:    vpmullw %ymm4, %ymm0, %ymm4
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm4
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ zmm5 ^ zmm3
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3
-; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm5
-; AVX512-NEXT:    vpmullw %ymm5, %ymm1, %ymm5
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm4 ^ ymm6 ^ ymm3
+; AVX512-NEXT:    vpand %ymm1, %ymm8, %ymm3
 ; AVX512-NEXT:    vpmullw %ymm3, %ymm0, %ymm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm3
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512-NEXT:    vextracti64x4 $1, %zmm2, %ymm5
-; AVX512-NEXT:    vpmullw %ymm5, %ymm1, %ymm1
-; AVX512-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ zmm4 ^ zmm3
-; AVX512-NEXT:    vpsrlw $8, %ymm0, %ymm1
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vpand %ymm1, %ymm13, %ymm1
+; AVX512-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ ymm4 ^ ymm3
 ; AVX512-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512-NEXT:    retq
   %a.ext = zext <32 x i8> %a to <32 x i16>
   %b.ext = zext <32 x i8> %b to <32 x i16>
@@ -1922,232 +1941,234 @@ define <32 x i8> @clmulh_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <16 x i16> @clmulh_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX1-LABEL: clmulh_v16i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm5
+; AVX1-NEXT:    vmovdqa %ymm1, %ymm5
+; AVX1-NEXT:    vmovdqa %ymm0, %ymm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm12 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX1-NEXT:    vpshufb %xmm12, %xmm4, %xmm0
 ; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm6
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm6
-; AVX1-NEXT:    vpsrlw $4, %xmm5, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm7
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
-; AVX1-NEXT:    vpshufb %xmm7, %xmm5, %xmm7
-; AVX1-NEXT:    vpor %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm7
-; AVX1-NEXT:    vpshufb %xmm7, %xmm4, %xmm7
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
-; AVX1-NEXT:    vpor %xmm1, %xmm7, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm7
-; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
-; AVX1-NEXT:    vpand %xmm2, %xmm6, %xmm9
-; AVX1-NEXT:    vpshufb %xmm9, %xmm4, %xmm9
-; AVX1-NEXT:    vpsrlw $4, %xmm6, %xmm6
-; AVX1-NEXT:    vpand %xmm2, %xmm6, %xmm6
-; AVX1-NEXT:    vpshufb %xmm6, %xmm5, %xmm6
-; AVX1-NEXT:    vpor %xmm6, %xmm9, %xmm6
-; AVX1-NEXT:    vpmullw %xmm6, %xmm8, %xmm8
-; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm9
-; AVX1-NEXT:    vpshufb %xmm9, %xmm4, %xmm9
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vpor %xmm0, %xmm9, %xmm0
-; AVX1-NEXT:    vpmullw %xmm7, %xmm0, %xmm7
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm7, %ymm7
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm8
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm9
-; AVX1-NEXT:    vpmullw %xmm6, %xmm9, %xmm9
-; AVX1-NEXT:    vpmullw %xmm0, %xmm8, %xmm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm8, %ymm8
-; AVX1-NEXT:    vxorps %ymm7, %ymm8, %ymm7
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm8
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm9
-; AVX1-NEXT:    vpmullw %xmm6, %xmm9, %xmm9
-; AVX1-NEXT:    vpmullw %xmm0, %xmm8, %xmm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm6, %xmm10, %xmm10
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vxorps %ymm7, %ymm8, %ymm7
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm8
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm9
-; AVX1-NEXT:    vpmullw %xmm6, %xmm9, %xmm9
-; AVX1-NEXT:    vpmullw %xmm0, %xmm8, %xmm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm6, %xmm10, %xmm10
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm6, %xmm10, %xmm10
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vxorps %ymm7, %ymm8, %ymm7
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm8
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm9
-; AVX1-NEXT:    vpmullw %xmm6, %xmm9, %xmm9
-; AVX1-NEXT:    vpmullw %xmm0, %xmm8, %xmm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm6, %xmm10, %xmm10
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm6, %xmm10, %xmm10
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm6, %xmm10, %xmm10
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vxorps %ymm7, %ymm8, %ymm7
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm8
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm9
-; AVX1-NEXT:    vpmullw %xmm6, %xmm9, %xmm9
-; AVX1-NEXT:    vpmullw %xmm0, %xmm8, %xmm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm6, %xmm10, %xmm10
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm6, %xmm10, %xmm10
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm10
-; AVX1-NEXT:    vpmullw %xmm6, %xmm10, %xmm10
-; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vxorps %ymm9, %ymm8, %ymm8
-; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm9
-; AVX1-NEXT:    vpmullw %xmm6, %xmm9, %xmm6
-; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX1-NEXT:    vxorps %ymm0, %ymm8, %ymm0
-; AVX1-NEXT:    vxorps %ymm0, %ymm7, %ymm0
-; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm6
-; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm6
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm12, %xmm5, %xmm0
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm7
+; AVX1-NEXT:    vpand %xmm2, %xmm7, %xmm7
+; AVX1-NEXT:    vpshufb %xmm7, %xmm6, %xmm7
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm14 = [2,2,2,2,2,2,2,2]
+; AVX1-NEXT:    vpand %xmm7, %xmm14, %xmm9
+; AVX1-NEXT:    vpmullw %xmm3, %xmm9, %xmm9
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm10
+; AVX1-NEXT:    vpmullw %xmm3, %xmm10, %xmm10
+; AVX1-NEXT:    vpxor %xmm9, %xmm10, %xmm9
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm10
+; AVX1-NEXT:    vpmullw %xmm3, %xmm10, %xmm10
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm11
+; AVX1-NEXT:    vpmullw %xmm3, %xmm11, %xmm11
+; AVX1-NEXT:    vpxor %xmm11, %xmm10, %xmm10
+; AVX1-NEXT:    vpxor %xmm10, %xmm9, %xmm9
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm8, %xmm0
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm10
+; AVX1-NEXT:    vpmullw %xmm3, %xmm10, %xmm10
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm11
+; AVX1-NEXT:    vpmullw %xmm3, %xmm11, %xmm11
+; AVX1-NEXT:    vpxor %xmm11, %xmm10, %xmm10
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm11
+; AVX1-NEXT:    vpmullw %xmm3, %xmm11, %xmm11
+; AVX1-NEXT:    vpxor %xmm11, %xmm10, %xmm10
+; AVX1-NEXT:    vpxor %xmm10, %xmm9, %xmm9
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm10
+; AVX1-NEXT:    vpmullw %xmm3, %xmm10, %xmm10
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm11
+; AVX1-NEXT:    vpmullw %xmm3, %xmm11, %xmm11
+; AVX1-NEXT:    vpxor %xmm11, %xmm10, %xmm10
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm11
+; AVX1-NEXT:    vpmullw %xmm3, %xmm11, %xmm11
+; AVX1-NEXT:    vpxor %xmm11, %xmm10, %xmm10
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm11
+; AVX1-NEXT:    vpmullw %xmm3, %xmm11, %xmm11
+; AVX1-NEXT:    vpxor %xmm11, %xmm10, %xmm10
+; AVX1-NEXT:    vpxor %xmm10, %xmm9, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm13 = [2048,2048,2048,2048,2048,2048,2048,2048]
+; AVX1-NEXT:    vpand %xmm7, %xmm13, %xmm7
+; AVX1-NEXT:    vpmullw %xmm7, %xmm3, %xmm7
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm11 = [4096,4096,4096,4096,4096,4096,4096,4096]
+; AVX1-NEXT:    vpand %xmm0, %xmm11, %xmm9
+; AVX1-NEXT:    vpmullw %xmm3, %xmm9, %xmm9
+; AVX1-NEXT:    vpxor %xmm7, %xmm9, %xmm7
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm10 = [8192,8192,8192,8192,8192,8192,8192,8192]
+; AVX1-NEXT:    vpand %xmm0, %xmm10, %xmm9
+; AVX1-NEXT:    vpmullw %xmm3, %xmm9, %xmm9
+; AVX1-NEXT:    vpxor %xmm7, %xmm9, %xmm7
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm9 = [16384,16384,16384,16384,16384,16384,16384,16384]
+; AVX1-NEXT:    vpand %xmm0, %xmm9, %xmm15
+; AVX1-NEXT:    vpmullw %xmm3, %xmm15, %xmm15
+; AVX1-NEXT:    vpxor %xmm7, %xmm15, %xmm15
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm7 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm15, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm0
+; AVX1-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm1
+; AVX1-NEXT:    vpshufb %xmm12, %xmm1, %xmm4
+; AVX1-NEXT:    vpsrlw $4, %xmm4, %xmm1
 ; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
-; AVX1-NEXT:    vpor %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm14, %xmm5
+; AVX1-NEXT:    vpmullw %xmm5, %xmm0, %xmm5
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm15
+; AVX1-NEXT:    vpmullw %xmm0, %xmm15, %xmm15
+; AVX1-NEXT:    vpxor %xmm5, %xmm15, %xmm5
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm15
+; AVX1-NEXT:    vpmullw %xmm0, %xmm15, %xmm15
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm14
+; AVX1-NEXT:    vpmullw %xmm0, %xmm14, %xmm14
+; AVX1-NEXT:    vpxor %xmm14, %xmm15, %xmm14
+; AVX1-NEXT:    vpxor %xmm5, %xmm14, %xmm5
+; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm4, %xmm8, %xmm4
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm14
+; AVX1-NEXT:    vpmullw %xmm0, %xmm14, %xmm14
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm15
+; AVX1-NEXT:    vpmullw %xmm0, %xmm15, %xmm15
+; AVX1-NEXT:    vpxor %xmm15, %xmm14, %xmm14
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm15
+; AVX1-NEXT:    vpmullw %xmm0, %xmm15, %xmm15
+; AVX1-NEXT:    vpxor %xmm15, %xmm14, %xmm14
+; AVX1-NEXT:    vpxor %xmm5, %xmm14, %xmm5
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm14
+; AVX1-NEXT:    vpmullw %xmm0, %xmm14, %xmm14
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm15
+; AVX1-NEXT:    vpmullw %xmm0, %xmm15, %xmm15
+; AVX1-NEXT:    vpxor %xmm15, %xmm14, %xmm14
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm15
+; AVX1-NEXT:    vpmullw %xmm0, %xmm15, %xmm15
+; AVX1-NEXT:    vpxor %xmm15, %xmm14, %xmm14
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm15
+; AVX1-NEXT:    vpmullw %xmm0, %xmm15, %xmm15
+; AVX1-NEXT:    vpxor %xmm15, %xmm14, %xmm14
+; AVX1-NEXT:    vpxor %xmm5, %xmm14, %xmm5
+; AVX1-NEXT:    vpand %xmm1, %xmm13, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm11, %xmm11
+; AVX1-NEXT:    vpmullw %xmm0, %xmm11, %xmm11
+; AVX1-NEXT:    vpxor %xmm1, %xmm11, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm10, %xmm10
+; AVX1-NEXT:    vpmullw %xmm0, %xmm10, %xmm10
+; AVX1-NEXT:    vpxor %xmm1, %xmm10, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm9, %xmm9
+; AVX1-NEXT:    vpmullw %xmm0, %xmm9, %xmm9
+; AVX1-NEXT:    vpxor %xmm1, %xmm9, %xmm1
+; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpmullw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: clmulh_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
-; AVX2-NEXT:    vpshufb %ymm5, %ymm0, %ymm3
-; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm0, %ymm3, %ymm4
-; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
 ; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX2-NEXT:    vpshufb %ymm4, %ymm2, %ymm4
-; AVX2-NEXT:    vpsrlw $4, %ymm3, %ymm3
-; AVX2-NEXT:    vpand %ymm0, %ymm3, %ymm6
-; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm4
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm0, %ymm4, %ymm5
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
 ; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX2-NEXT:    vpshufb %ymm6, %ymm3, %ymm6
-; AVX2-NEXT:    vpor %ymm6, %ymm4, %ymm4
-; AVX2-NEXT:    vpshufb %ymm5, %ymm1, %ymm5
-; AVX2-NEXT:    vpsrlw $4, %ymm5, %ymm1
+; AVX2-NEXT:    vpshufb %ymm5, %ymm3, %ymm5
+; AVX2-NEXT:    vpsrlw $4, %ymm4, %ymm4
+; AVX2-NEXT:    vpand %ymm0, %ymm4, %ymm6
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX2-NEXT:    vpshufb %ymm6, %ymm4, %ymm6
+; AVX2-NEXT:    vpor %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm6
+; AVX2-NEXT:    vpsrlw $4, %ymm6, %ymm1
 ; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm6
-; AVX2-NEXT:    vpmullw %ymm6, %ymm4, %ymm6
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm7
-; AVX2-NEXT:    vpmullw %ymm7, %ymm4, %ymm7
-; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm6
+; AVX2-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm7
-; AVX2-NEXT:    vpmullw %ymm7, %ymm4, %ymm7
+; AVX2-NEXT:    vpmullw %ymm7, %ymm5, %ymm7
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm8
-; AVX2-NEXT:    vpmullw %ymm4, %ymm8, %ymm8
-; AVX2-NEXT:    vpxor %ymm7, %ymm8, %ymm7
-; AVX2-NEXT:    vpxor %ymm7, %ymm6, %ymm6
-; AVX2-NEXT:    vpand %ymm0, %ymm5, %ymm5
-; AVX2-NEXT:    vpshufb %ymm5, %ymm2, %ymm5
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm7
-; AVX2-NEXT:    vpmullw %ymm7, %ymm4, %ymm7
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8
-; AVX2-NEXT:    vpmullw %ymm4, %ymm8, %ymm8
+; AVX2-NEXT:    vpmullw %ymm5, %ymm8, %ymm8
 ; AVX2-NEXT:    vpxor %ymm7, %ymm8, %ymm7
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8
-; AVX2-NEXT:    vpmullw %ymm4, %ymm8, %ymm8
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm8
+; AVX2-NEXT:    vpmullw %ymm5, %ymm8, %ymm8
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
+; AVX2-NEXT:    vpmullw %ymm5, %ymm9, %ymm9
+; AVX2-NEXT:    vpxor %ymm9, %ymm8, %ymm8
 ; AVX2-NEXT:    vpxor %ymm7, %ymm8, %ymm7
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8
-; AVX2-NEXT:    vpmullw %ymm4, %ymm8, %ymm8
+; AVX2-NEXT:    vpand %ymm0, %ymm6, %ymm6
+; AVX2-NEXT:    vpshufb %ymm6, %ymm3, %ymm6
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm8
+; AVX2-NEXT:    vpmullw %ymm5, %ymm8, %ymm8
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm9
+; AVX2-NEXT:    vpmullw %ymm5, %ymm9, %ymm9
+; AVX2-NEXT:    vpxor %ymm9, %ymm8, %ymm8
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm9
+; AVX2-NEXT:    vpmullw %ymm5, %ymm9, %ymm9
+; AVX2-NEXT:    vpxor %ymm9, %ymm8, %ymm8
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm9
+; AVX2-NEXT:    vpmullw %ymm5, %ymm9, %ymm9
+; AVX2-NEXT:    vpxor %ymm9, %ymm8, %ymm8
 ; AVX2-NEXT:    vpxor %ymm7, %ymm8, %ymm7
-; AVX2-NEXT:    vpxor %ymm7, %ymm6, %ymm6
-; AVX2-NEXT:    vpsllw $8, %ymm6, %ymm7
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm8
-; AVX2-NEXT:    vpmullw %ymm4, %ymm8, %ymm8
+; AVX2-NEXT:    vpmullw %ymm5, %ymm8, %ymm8
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX2-NEXT:    vpmullw %ymm4, %ymm9, %ymm9
+; AVX2-NEXT:    vpmullw %ymm5, %ymm9, %ymm9
 ; AVX2-NEXT:    vpxor %ymm9, %ymm8, %ymm8
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm9
-; AVX2-NEXT:    vpmullw %ymm4, %ymm9, %ymm9
+; AVX2-NEXT:    vpmullw %ymm5, %ymm9, %ymm9
 ; AVX2-NEXT:    vpxor %ymm9, %ymm8, %ymm8
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpmullw %ymm1, %ymm5, %ymm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm8, %ymm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm6, %ymm1
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm6
-; AVX2-NEXT:    vpmullw %ymm6, %ymm4, %ymm6
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8
-; AVX2-NEXT:    vpmullw %ymm4, %ymm8, %ymm8
-; AVX2-NEXT:    vpxor %ymm6, %ymm8, %ymm6
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8
-; AVX2-NEXT:    vpmullw %ymm4, %ymm8, %ymm8
-; AVX2-NEXT:    vpxor %ymm6, %ymm8, %ymm6
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5
-; AVX2-NEXT:    vpmullw %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpxor %ymm4, %ymm6, %ymm4
-; AVX2-NEXT:    vpxor %ymm4, %ymm1, %ymm1
-; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT:    vpor %ymm1, %ymm7, %ymm1
-; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm4
-; AVX2-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpxor %ymm1, %ymm7, %ymm1
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm7
+; AVX2-NEXT:    vpmullw %ymm7, %ymm5, %ymm7
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm8
+; AVX2-NEXT:    vpmullw %ymm5, %ymm8, %ymm8
+; AVX2-NEXT:    vpxor %ymm7, %ymm8, %ymm7
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm8
+; AVX2-NEXT:    vpmullw %ymm5, %ymm8, %ymm8
+; AVX2-NEXT:    vpxor %ymm7, %ymm8, %ymm7
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
+; AVX2-NEXT:    vpmullw %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpxor %ymm5, %ymm7, %ymm5
+; AVX2-NEXT:    vpxor %ymm5, %ymm1, %ymm1
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/clmul-vector-512.ll b/llvm/test/CodeGen/X86/clmul-vector-512.ll
index 6a5f1f4b6d660..f337d9177eb80 100644
--- a/llvm/test/CodeGen/X86/clmul-vector-512.ll
+++ b/llvm/test/CodeGen/X86/clmul-vector-512.ll
@@ -303,163 +303,183 @@ define <8 x i64> @clmul_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
 define <64 x i8> @clmulr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512F-LABEL: clmulr_v64i8:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm3, %ymm5
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpshufb %ymm5, %ymm4, %ymm5
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpshufb %ymm5, %ymm0, %ymm5
 ; AVX512F-NEXT:    vpsrlw $4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpand %ymm2, %ymm3, %ymm6
 ; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
 ; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm6, %ymm3, %ymm6
-; AVX512F-NEXT:    vpor %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm7
-; AVX512F-NEXT:    vpsrlw $4, %ymm7, %ymm6
-; AVX512F-NEXT:    vpand %ymm2, %ymm6, %ymm6
-; AVX512F-NEXT:    vpshufb %ymm6, %ymm3, %ymm6
-; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm8
-; AVX512F-NEXT:    vpand %ymm2, %ymm8, %ymm8
-; AVX512F-NEXT:    vpshufb %ymm8, %ymm3, %ymm8
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm6, %zmm8, %zmm8
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm8, %zmm9
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm10, %ymm6
-; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm11
-; AVX512F-NEXT:    vpshufb %ymm11, %ymm4, %ymm11
-; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512F-NEXT:    vpor %ymm0, %ymm11, %ymm0
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm9, %ymm11
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm6, %zmm11, %zmm11
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX512F-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512F-NEXT:    vpmaddubsw %ymm9, %ymm0, %ymm9
-; AVX512F-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512F-NEXT:    vpandn %ymm10, %ymm6, %ymm10
-; AVX512F-NEXT:    vpmaddubsw %ymm10, %ymm5, %ymm10
-; AVX512F-NEXT:    vpsllw $8, %ymm10, %ymm10
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm11 & zmm6)
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm8, %zmm10
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm10, %ymm11
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm11, %ymm12
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm10, %ymm13
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
-; AVX512F-NEXT:    vpandq %zmm6, %zmm12, %zmm12
-; AVX512F-NEXT:    vpandn %ymm10, %ymm6, %ymm10
-; AVX512F-NEXT:    vpmaddubsw %ymm10, %ymm0, %ymm10
+; AVX512F-NEXT:    vpor %ymm6, %ymm5, %ymm9
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm4, %ymm11
+; AVX512F-NEXT:    vpsrlw $4, %ymm11, %ymm5
+; AVX512F-NEXT:    vpand %ymm2, %ymm5, %ymm5
+; AVX512F-NEXT:    vpshufb %ymm5, %ymm3, %ymm12
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm7
+; AVX512F-NEXT:    vpmullw %ymm7, %ymm9, %ymm8
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm8
+; AVX512F-NEXT:    vpandn %ymm7, %ymm5, %ymm7
+; AVX512F-NEXT:    vpmaddubsw %ymm7, %ymm9, %ymm7
+; AVX512F-NEXT:    vpsllw $8, %ymm7, %ymm7
+; AVX512F-NEXT:    vpor %ymm7, %ymm8, %ymm8
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm10
+; AVX512F-NEXT:    vpmullw %ymm10, %ymm9, %ymm13
+; AVX512F-NEXT:    vpand %ymm5, %ymm13, %ymm13
+; AVX512F-NEXT:    vpandn %ymm10, %ymm5, %ymm10
+; AVX512F-NEXT:    vpmaddubsw %ymm10, %ymm9, %ymm10
 ; AVX512F-NEXT:    vpsllw $8, %ymm10, %ymm10
-; AVX512F-NEXT:    vpandn %ymm11, %ymm6, %ymm11
-; AVX512F-NEXT:    vpmaddubsw %ymm11, %ymm5, %ymm11
-; AVX512F-NEXT:    vpsllw $8, %ymm11, %ymm11
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm11, %zmm10, %zmm10
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm9 ^ (zmm10 | zmm12)
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm8, %zmm9
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm9, %ymm11
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm11, %ymm12
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm9, %ymm13
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
-; AVX512F-NEXT:    vpandq %zmm6, %zmm12, %zmm12
-; AVX512F-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512F-NEXT:    vpmaddubsw %ymm9, %ymm0, %ymm9
-; AVX512F-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512F-NEXT:    vpandn %ymm11, %ymm6, %ymm11
-; AVX512F-NEXT:    vpmaddubsw %ymm11, %ymm5, %ymm11
-; AVX512F-NEXT:    vpsllw $8, %ymm11, %ymm11
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm11, %zmm9, %zmm9
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm10 ^ (zmm9 | zmm12)
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm8, %zmm8
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm8, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm10, %ymm11
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm8, %ymm12
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm11, %zmm12, %zmm11
-; AVX512F-NEXT:    vpandq %zmm6, %zmm11, %zmm11
-; AVX512F-NEXT:    vpandn %ymm8, %ymm6, %ymm8
-; AVX512F-NEXT:    vpmaddubsw %ymm8, %ymm0, %ymm8
-; AVX512F-NEXT:    vpsllw $8, %ymm8, %ymm8
-; AVX512F-NEXT:    vpandn %ymm10, %ymm6, %ymm10
-; AVX512F-NEXT:    vpmaddubsw %ymm10, %ymm5, %ymm10
+; AVX512F-NEXT:    vpor %ymm10, %ymm13, %ymm10
+; AVX512F-NEXT:    vpxor %ymm8, %ymm10, %ymm13
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm8 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512F-NEXT:    vpand %ymm8, %ymm12, %ymm10
+; AVX512F-NEXT:    vpmullw %ymm10, %ymm9, %ymm14
+; AVX512F-NEXT:    vpand %ymm5, %ymm14, %ymm14
+; AVX512F-NEXT:    vpandn %ymm10, %ymm5, %ymm10
+; AVX512F-NEXT:    vpmaddubsw %ymm10, %ymm9, %ymm10
 ; AVX512F-NEXT:    vpsllw $8, %ymm10, %ymm10
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm9 ^ (zmm8 | zmm11)
-; AVX512F-NEXT:    vpand %ymm2, %ymm7, %ymm7
-; AVX512F-NEXT:    vpshufb %ymm7, %ymm4, %ymm7
+; AVX512F-NEXT:    vpor %ymm10, %ymm14, %ymm14
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm12
+; AVX512F-NEXT:    vpmullw %ymm12, %ymm9, %ymm15
+; AVX512F-NEXT:    vpand %ymm5, %ymm15, %ymm15
+; AVX512F-NEXT:    vpandn %ymm12, %ymm5, %ymm12
+; AVX512F-NEXT:    vpmaddubsw %ymm12, %ymm9, %ymm12
+; AVX512F-NEXT:    vpsllw $8, %ymm12, %ymm12
+; AVX512F-NEXT:    vpor %ymm12, %ymm15, %ymm12
+; AVX512F-NEXT:    vpxor %ymm12, %ymm14, %ymm12
+; AVX512F-NEXT:    vpxor %ymm12, %ymm13, %ymm14
+; AVX512F-NEXT:    vpand %ymm2, %ymm11, %ymm11
+; AVX512F-NEXT:    vmovdqa %ymm0, %ymm10
+; AVX512F-NEXT:    vpshufb %ymm11, %ymm0, %ymm15
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm11 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpand %ymm11, %ymm15, %ymm12
+; AVX512F-NEXT:    vpmullw %ymm12, %ymm9, %ymm13
+; AVX512F-NEXT:    vpand %ymm5, %ymm13, %ymm13
+; AVX512F-NEXT:    vpandn %ymm12, %ymm5, %ymm12
+; AVX512F-NEXT:    vpmaddubsw %ymm12, %ymm9, %ymm12
+; AVX512F-NEXT:    vpsllw $8, %ymm12, %ymm12
+; AVX512F-NEXT:    vpor %ymm12, %ymm13, %ymm13
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT:    vpand %ymm12, %ymm15, %ymm0
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm9, %ymm6
+; AVX512F-NEXT:    vpand %ymm5, %ymm6, %ymm6
+; AVX512F-NEXT:    vpandn %ymm0, %ymm5, %ymm0
+; AVX512F-NEXT:    vpmaddubsw %ymm0, %ymm9, %ymm0
+; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT:    vpxor %ymm0, %ymm13, %ymm0
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm13 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512F-NEXT:    vpand %ymm13, %ymm15, %ymm6
+; AVX512F-NEXT:    vpmullw %ymm6, %ymm9, %ymm7
+; AVX512F-NEXT:    vpand %ymm5, %ymm7, %ymm7
+; AVX512F-NEXT:    vpandn %ymm6, %ymm5, %ymm6
+; AVX512F-NEXT:    vpmaddubsw %ymm6, %ymm9, %ymm6
+; AVX512F-NEXT:    vpsllw $8, %ymm6, %ymm6
+; AVX512F-NEXT:    vpor %ymm6, %ymm7, %ymm6
+; AVX512F-NEXT:    vpxor %ymm6, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %ymm0, %ymm14, %ymm0
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512F-NEXT:    vpand %ymm14, %ymm15, %ymm6
+; AVX512F-NEXT:    vpmullw %ymm6, %ymm9, %ymm7
+; AVX512F-NEXT:    vpandn %ymm6, %ymm5, %ymm6
+; AVX512F-NEXT:    vpmaddubsw %ymm6, %ymm9, %ymm6
+; AVX512F-NEXT:    vpand %ymm5, %ymm7, %ymm7
+; AVX512F-NEXT:    vpsllw $8, %ymm6, %ymm6
+; AVX512F-NEXT:    vpor %ymm6, %ymm7, %ymm6
+; AVX512F-NEXT:    vpxor %ymm6, %ymm0, %ymm9
+; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm0
+; AVX512F-NEXT:    vpshufb %ymm0, %ymm10, %ymm0
+; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $4, %ymm4, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm7, %zmm1, %zmm1
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm9, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm11
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512F-NEXT:    vpandq %zmm6, %zmm10, %zmm10
-; AVX512F-NEXT:    vpandn %ymm7, %ymm6, %ymm7
+; AVX512F-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm6
+; AVX512F-NEXT:    vpmullw %ymm6, %ymm0, %ymm7
+; AVX512F-NEXT:    vpand %ymm5, %ymm7, %ymm7
+; AVX512F-NEXT:    vpandn %ymm6, %ymm5, %ymm6
+; AVX512F-NEXT:    vpmaddubsw %ymm6, %ymm0, %ymm6
+; AVX512F-NEXT:    vpsllw $8, %ymm6, %ymm6
+; AVX512F-NEXT:    vpor %ymm6, %ymm7, %ymm6
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm7
+; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm15
+; AVX512F-NEXT:    vpand %ymm5, %ymm15, %ymm15
+; AVX512F-NEXT:    vpandn %ymm7, %ymm5, %ymm7
 ; AVX512F-NEXT:    vpmaddubsw %ymm7, %ymm0, %ymm7
 ; AVX512F-NEXT:    vpsllw $8, %ymm7, %ymm7
-; AVX512F-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512F-NEXT:    vpmaddubsw %ymm9, %ymm5, %ymm9
-; AVX512F-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm8 ^ (zmm7 | zmm10)
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm8
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm8, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm9, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm8, %ymm11
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512F-NEXT:    vpandq %zmm6, %zmm10, %zmm10
-; AVX512F-NEXT:    vpandn %ymm8, %ymm6, %ymm8
-; AVX512F-NEXT:    vpmaddubsw %ymm8, %ymm0, %ymm8
-; AVX512F-NEXT:    vpsllw $8, %ymm8, %ymm8
-; AVX512F-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512F-NEXT:    vpmaddubsw %ymm9, %ymm5, %ymm9
-; AVX512F-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm7 ^ (zmm8 | zmm10)
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm9, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm11
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512F-NEXT:    vpandq %zmm6, %zmm10, %zmm10
-; AVX512F-NEXT:    vpandn %ymm7, %ymm6, %ymm7
+; AVX512F-NEXT:    vpor %ymm7, %ymm15, %ymm7
+; AVX512F-NEXT:    vpxor %ymm6, %ymm7, %ymm6
+; AVX512F-NEXT:    vpand %ymm1, %ymm8, %ymm7
+; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm8
+; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm8
+; AVX512F-NEXT:    vpandn %ymm7, %ymm5, %ymm7
 ; AVX512F-NEXT:    vpmaddubsw %ymm7, %ymm0, %ymm7
 ; AVX512F-NEXT:    vpsllw $8, %ymm7, %ymm7
-; AVX512F-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512F-NEXT:    vpmaddubsw %ymm9, %ymm5, %ymm9
-; AVX512F-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm8 ^ (zmm7 | zmm10)
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm8
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm8, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm10
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm9, %zmm10, %zmm9
-; AVX512F-NEXT:    vpandq %zmm6, %zmm9, %zmm9
-; AVX512F-NEXT:    vpandn %ymm1, %ymm6, %ymm1
-; AVX512F-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vpandn %ymm8, %ymm6, %ymm1
-; AVX512F-NEXT:    vpmaddubsw %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT:    vpor %ymm7, %ymm8, %ymm7
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm8
+; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm8
+; AVX512F-NEXT:    vpandn %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpsllw $8, %ymm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm7 ^ (zmm0 | zmm9)
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm5
-; AVX512F-NEXT:    vpshufb %ymm5, %ymm4, %ymm5
-; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm6
-; AVX512F-NEXT:    vpshufb %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm8, %ymm1
+; AVX512F-NEXT:    vpxor %ymm1, %ymm7, %ymm1
+; AVX512F-NEXT:    vpxor %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT:    vpand %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT:    vpshufb %ymm4, %ymm10, %ymm4
+; AVX512F-NEXT:    vpand %ymm4, %ymm11, %ymm6
+; AVX512F-NEXT:    vpmullw %ymm6, %ymm0, %ymm7
+; AVX512F-NEXT:    vpand %ymm5, %ymm7, %ymm7
+; AVX512F-NEXT:    vpandn %ymm6, %ymm5, %ymm6
+; AVX512F-NEXT:    vpmaddubsw %ymm6, %ymm0, %ymm6
+; AVX512F-NEXT:    vpsllw $8, %ymm6, %ymm6
+; AVX512F-NEXT:    vpor %ymm6, %ymm7, %ymm6
+; AVX512F-NEXT:    vpand %ymm4, %ymm12, %ymm7
+; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm8
+; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm8
+; AVX512F-NEXT:    vpandn %ymm7, %ymm5, %ymm7
+; AVX512F-NEXT:    vpmaddubsw %ymm7, %ymm0, %ymm7
+; AVX512F-NEXT:    vpsllw $8, %ymm7, %ymm7
+; AVX512F-NEXT:    vpor %ymm7, %ymm8, %ymm7
+; AVX512F-NEXT:    vpxor %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT:    vpand %ymm4, %ymm13, %ymm7
+; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm8
+; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm8
+; AVX512F-NEXT:    vpandn %ymm7, %ymm5, %ymm7
+; AVX512F-NEXT:    vpmaddubsw %ymm7, %ymm0, %ymm7
+; AVX512F-NEXT:    vpsllw $8, %ymm7, %ymm7
+; AVX512F-NEXT:    vpor %ymm7, %ymm8, %ymm7
+; AVX512F-NEXT:    vpxor %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT:    vpxor %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm4, %ymm14, %ymm4
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm0, %ymm6
+; AVX512F-NEXT:    vpand %ymm5, %ymm6, %ymm6
+; AVX512F-NEXT:    vpandn %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT:    vpmaddubsw %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT:    vpand %ymm2, %ymm9, %ymm4
+; AVX512F-NEXT:    vpshufb %ymm4, %ymm10, %ymm4
+; AVX512F-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm1
+; AVX512F-NEXT:    vpshufb %ymm1, %ymm10, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512F-NEXT:    vpsrlw $4, %ymm9, %ymm4
+; AVX512F-NEXT:    vpand %ymm2, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpshufb %ymm4, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vporq %zmm0, %zmm4, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT:    vporq %zmm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: clmulr_v64i8:
@@ -475,152 +495,156 @@ define <64 x i8> @clmulr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
 ; AVX512VL-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VL-NEXT:    vpshufb %ymm6, %ymm3, %ymm6
-; AVX512VL-NEXT:    vpor %ymm6, %ymm5, %ymm5
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm7
-; AVX512VL-NEXT:    vpsrlw $4, %ymm7, %ymm6
-; AVX512VL-NEXT:    vpand %ymm2, %ymm6, %ymm6
-; AVX512VL-NEXT:    vpshufb %ymm6, %ymm3, %ymm6
-; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm8
-; AVX512VL-NEXT:    vpand %ymm2, %ymm8, %ymm8
-; AVX512VL-NEXT:    vpshufb %ymm8, %ymm3, %ymm8
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm6, %zmm8, %zmm8
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm8, %zmm9
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm10, %ymm6
-; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm11
-; AVX512VL-NEXT:    vpshufb %ymm11, %ymm4, %ymm11
+; AVX512VL-NEXT:    vpor %ymm6, %ymm5, %ymm8
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm11
+; AVX512VL-NEXT:    vpsrlw $4, %ymm11, %ymm5
+; AVX512VL-NEXT:    vpand %ymm2, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpshufb %ymm5, %ymm3, %ymm12
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm16 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT:    vpandq %ymm16, %ymm12, %ymm7
+; AVX512VL-NEXT:    vpmullw %ymm7, %ymm8, %ymm9
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpandn %ymm7, %ymm5, %ymm7
+; AVX512VL-NEXT:    vpmaddubsw %ymm7, %ymm8, %ymm7
+; AVX512VL-NEXT:    vpsllw $8, %ymm7, %ymm10
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm9 & ymm5)
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm17 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512VL-NEXT:    vpandq %ymm17, %ymm12, %ymm9
+; AVX512VL-NEXT:    vpmullw %ymm9, %ymm8, %ymm13
+; AVX512VL-NEXT:    vpand %ymm5, %ymm13, %ymm13
+; AVX512VL-NEXT:    vpandn %ymm9, %ymm5, %ymm9
+; AVX512VL-NEXT:    vpmaddubsw %ymm9, %ymm8, %ymm9
+; AVX512VL-NEXT:    vpsllw $8, %ymm9, %ymm14
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm10 ^ (ymm14 | ymm13)
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm18 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VL-NEXT:    vpandq %ymm18, %ymm12, %ymm10
+; AVX512VL-NEXT:    vpmullw %ymm10, %ymm8, %ymm13
+; AVX512VL-NEXT:    vpand %ymm5, %ymm13, %ymm13
+; AVX512VL-NEXT:    vpandn %ymm10, %ymm5, %ymm10
+; AVX512VL-NEXT:    vpmaddubsw %ymm10, %ymm8, %ymm10
+; AVX512VL-NEXT:    vpsllw $8, %ymm10, %ymm15
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm14 ^ (ymm15 | ymm13)
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm19 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpandq %ymm19, %ymm12, %ymm12
+; AVX512VL-NEXT:    vpmullw %ymm12, %ymm8, %ymm13
+; AVX512VL-NEXT:    vpand %ymm5, %ymm13, %ymm13
+; AVX512VL-NEXT:    vpandn %ymm12, %ymm5, %ymm12
+; AVX512VL-NEXT:    vpmaddubsw %ymm12, %ymm8, %ymm12
+; AVX512VL-NEXT:    vpsllw $8, %ymm12, %ymm12
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm15 ^ (ymm12 | ymm13)
+; AVX512VL-NEXT:    vpand %ymm2, %ymm11, %ymm11
+; AVX512VL-NEXT:    vpshufb %ymm11, %ymm4, %ymm14
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpand %ymm11, %ymm14, %ymm13
+; AVX512VL-NEXT:    vpmullw %ymm13, %ymm8, %ymm15
+; AVX512VL-NEXT:    vpand %ymm5, %ymm15, %ymm15
+; AVX512VL-NEXT:    vpandn %ymm13, %ymm5, %ymm13
+; AVX512VL-NEXT:    vpmaddubsw %ymm13, %ymm8, %ymm13
+; AVX512VL-NEXT:    vpsllw $8, %ymm13, %ymm13
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm12 ^ (ymm13 | ymm15)
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512VL-NEXT:    vpand %ymm12, %ymm14, %ymm15
+; AVX512VL-NEXT:    vpmullw %ymm15, %ymm8, %ymm6
+; AVX512VL-NEXT:    vpand %ymm5, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpandn %ymm15, %ymm5, %ymm15
+; AVX512VL-NEXT:    vpmaddubsw %ymm15, %ymm8, %ymm15
+; AVX512VL-NEXT:    vpsllw $8, %ymm15, %ymm15
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm13 ^ (ymm15 | ymm6)
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm13 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512VL-NEXT:    vpand %ymm13, %ymm14, %ymm6
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm8, %ymm7
+; AVX512VL-NEXT:    vpand %ymm5, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpandn %ymm6, %ymm5, %ymm6
+; AVX512VL-NEXT:    vpmaddubsw %ymm6, %ymm8, %ymm6
+; AVX512VL-NEXT:    vpsllw $8, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm15 ^ (ymm6 | ymm7)
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512VL-NEXT:    vpand %ymm15, %ymm14, %ymm7
+; AVX512VL-NEXT:    vpmullw %ymm7, %ymm8, %ymm14
+; AVX512VL-NEXT:    vpand %ymm5, %ymm14, %ymm14
+; AVX512VL-NEXT:    vpandn %ymm7, %ymm5, %ymm7
+; AVX512VL-NEXT:    vpmaddubsw %ymm7, %ymm8, %ymm7
+; AVX512VL-NEXT:    vpsllw $8, %ymm7, %ymm8
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 | ymm14)
+; AVX512VL-NEXT:    vpand %ymm2, %ymm8, %ymm6
+; AVX512VL-NEXT:    vpshufb %ymm6, %ymm4, %ymm14
+; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm6
+; AVX512VL-NEXT:    vpshufb %ymm6, %ymm4, %ymm6
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512VL-NEXT:    vpor %ymm0, %ymm11, %ymm0
-; AVX512VL-NEXT:    vpmullw %ymm0, %ymm9, %ymm11
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm6, %zmm11, %zmm11
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX512VL-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512VL-NEXT:    vpmaddubsw %ymm9, %ymm0, %ymm9
-; AVX512VL-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512VL-NEXT:    vpandn %ymm10, %ymm6, %ymm10
-; AVX512VL-NEXT:    vpmaddubsw %ymm10, %ymm5, %ymm10
-; AVX512VL-NEXT:    vpsllw $8, %ymm10, %ymm10
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm11 & zmm6)
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm8, %zmm10
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm10, %ymm11
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm11, %ymm12
-; AVX512VL-NEXT:    vpmullw %ymm0, %ymm10, %ymm13
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
-; AVX512VL-NEXT:    vpandq %zmm6, %zmm12, %zmm12
-; AVX512VL-NEXT:    vpandn %ymm10, %ymm6, %ymm10
-; AVX512VL-NEXT:    vpmaddubsw %ymm10, %ymm0, %ymm10
-; AVX512VL-NEXT:    vpsllw $8, %ymm10, %ymm10
-; AVX512VL-NEXT:    vpandn %ymm11, %ymm6, %ymm11
-; AVX512VL-NEXT:    vpmaddubsw %ymm11, %ymm5, %ymm11
-; AVX512VL-NEXT:    vpsllw $8, %ymm11, %ymm11
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm11, %zmm10, %zmm10
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm9 ^ (zmm10 | zmm12)
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm8, %zmm9
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm9, %ymm11
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm11, %ymm12
-; AVX512VL-NEXT:    vpmullw %ymm0, %ymm9, %ymm13
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
-; AVX512VL-NEXT:    vpandq %zmm6, %zmm12, %zmm12
-; AVX512VL-NEXT:    vpandn %ymm9, %ymm6, %ymm9
+; AVX512VL-NEXT:    vpor %ymm0, %ymm6, %ymm0
+; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm6
+; AVX512VL-NEXT:    vpand %ymm2, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpshufb %ymm6, %ymm3, %ymm6
+; AVX512VL-NEXT:    vpandq %ymm16, %ymm6, %ymm7
+; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm9
+; AVX512VL-NEXT:    vpandn %ymm7, %ymm5, %ymm7
+; AVX512VL-NEXT:    vpmaddubsw %ymm7, %ymm0, %ymm7
+; AVX512VL-NEXT:    vpsllw $8, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm9 & ymm5)
+; AVX512VL-NEXT:    vpandq %ymm17, %ymm6, %ymm9
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm9, %ymm10
+; AVX512VL-NEXT:    vpand %ymm5, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpandn %ymm9, %ymm5, %ymm9
 ; AVX512VL-NEXT:    vpmaddubsw %ymm9, %ymm0, %ymm9
 ; AVX512VL-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512VL-NEXT:    vpandn %ymm11, %ymm6, %ymm11
-; AVX512VL-NEXT:    vpmaddubsw %ymm11, %ymm5, %ymm11
-; AVX512VL-NEXT:    vpsllw $8, %ymm11, %ymm11
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm11, %zmm9, %zmm9
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm10 ^ (zmm9 | zmm12)
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm8, %zmm8
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm8, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm10, %ymm11
-; AVX512VL-NEXT:    vpmullw %ymm0, %ymm8, %ymm12
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm11, %zmm12, %zmm11
-; AVX512VL-NEXT:    vpandq %zmm6, %zmm11, %zmm11
-; AVX512VL-NEXT:    vpandn %ymm8, %ymm6, %ymm8
-; AVX512VL-NEXT:    vpmaddubsw %ymm8, %ymm0, %ymm8
-; AVX512VL-NEXT:    vpsllw $8, %ymm8, %ymm8
-; AVX512VL-NEXT:    vpandn %ymm10, %ymm6, %ymm10
-; AVX512VL-NEXT:    vpmaddubsw %ymm10, %ymm5, %ymm10
-; AVX512VL-NEXT:    vpsllw $8, %ymm10, %ymm10
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm9 ^ (zmm8 | zmm11)
-; AVX512VL-NEXT:    vpand %ymm2, %ymm7, %ymm7
-; AVX512VL-NEXT:    vpshufb %ymm7, %ymm4, %ymm7
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm7 ^ (ymm9 | ymm10)
+; AVX512VL-NEXT:    vpandq %ymm18, %ymm6, %ymm7
+; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm10
+; AVX512VL-NEXT:    vpand %ymm5, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpandn %ymm7, %ymm5, %ymm7
+; AVX512VL-NEXT:    vpmaddubsw %ymm7, %ymm0, %ymm7
+; AVX512VL-NEXT:    vpsllw $8, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm9 ^ (ymm7 | ymm10)
+; AVX512VL-NEXT:    vpandq %ymm19, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm0, %ymm9
+; AVX512VL-NEXT:    vpand %ymm5, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpandn %ymm6, %ymm5, %ymm6
+; AVX512VL-NEXT:    vpmaddubsw %ymm6, %ymm0, %ymm6
+; AVX512VL-NEXT:    vpsllw $8, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm7 ^ (ymm6 | ymm9)
 ; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm7, %zmm1, %zmm1
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm9, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm11
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512VL-NEXT:    vpandq %zmm6, %zmm10, %zmm10
-; AVX512VL-NEXT:    vpandn %ymm7, %ymm6, %ymm7
+; AVX512VL-NEXT:    vpand %ymm1, %ymm11, %ymm7
+; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm9
+; AVX512VL-NEXT:    vpand %ymm5, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpandn %ymm7, %ymm5, %ymm7
 ; AVX512VL-NEXT:    vpmaddubsw %ymm7, %ymm0, %ymm7
 ; AVX512VL-NEXT:    vpsllw $8, %ymm7, %ymm7
-; AVX512VL-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512VL-NEXT:    vpmaddubsw %ymm9, %ymm5, %ymm9
-; AVX512VL-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm8 ^ (zmm7 | zmm10)
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm8
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm8, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm9, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm0, %ymm8, %ymm11
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512VL-NEXT:    vpandq %zmm6, %zmm10, %zmm10
-; AVX512VL-NEXT:    vpandn %ymm8, %ymm6, %ymm8
-; AVX512VL-NEXT:    vpmaddubsw %ymm8, %ymm0, %ymm8
-; AVX512VL-NEXT:    vpsllw $8, %ymm8, %ymm8
-; AVX512VL-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512VL-NEXT:    vpmaddubsw %ymm9, %ymm5, %ymm9
-; AVX512VL-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm7 ^ (zmm8 | zmm10)
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm9, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm11
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512VL-NEXT:    vpandq %zmm6, %zmm10, %zmm10
-; AVX512VL-NEXT:    vpandn %ymm7, %ymm6, %ymm7
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm6 ^ (ymm7 | ymm9)
+; AVX512VL-NEXT:    vpand %ymm1, %ymm12, %ymm6
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm0, %ymm9
+; AVX512VL-NEXT:    vpand %ymm5, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpandn %ymm6, %ymm5, %ymm6
+; AVX512VL-NEXT:    vpmaddubsw %ymm6, %ymm0, %ymm6
+; AVX512VL-NEXT:    vpsllw $8, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm7 ^ (ymm6 | ymm9)
+; AVX512VL-NEXT:    vpand %ymm1, %ymm13, %ymm7
+; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm9
+; AVX512VL-NEXT:    vpand %ymm5, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpandn %ymm7, %ymm5, %ymm7
 ; AVX512VL-NEXT:    vpmaddubsw %ymm7, %ymm0, %ymm7
 ; AVX512VL-NEXT:    vpsllw $8, %ymm7, %ymm7
-; AVX512VL-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512VL-NEXT:    vpmaddubsw %ymm9, %ymm5, %ymm9
-; AVX512VL-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm8 ^ (zmm7 | zmm10)
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm8
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm8, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm1, %ymm0, %ymm10
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm9, %zmm10, %zmm9
-; AVX512VL-NEXT:    vpandq %zmm6, %zmm9, %zmm9
-; AVX512VL-NEXT:    vpandn %ymm1, %ymm6, %ymm1
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm6 ^ (ymm7 | ymm9)
+; AVX512VL-NEXT:    vpand %ymm1, %ymm15, %ymm1
+; AVX512VL-NEXT:    vpmullw %ymm1, %ymm0, %ymm6
+; AVX512VL-NEXT:    vpand %ymm5, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpandn %ymm1, %ymm5, %ymm1
 ; AVX512VL-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsllw $8, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpandn %ymm8, %ymm6, %ymm1
-; AVX512VL-NEXT:    vpmaddubsw %ymm1, %ymm5, %ymm1
-; AVX512VL-NEXT:    vpsllw $8, %ymm1, %ymm1
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm7 ^ (zmm0 | zmm9)
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm5
-; AVX512VL-NEXT:    vpshufb %ymm5, %ymm4, %ymm5
-; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm6
-; AVX512VL-NEXT:    vpshufb %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm7 ^ (ymm0 | ymm6)
+; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm14, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpsrlw $4, %ymm8, %ymm4
+; AVX512VL-NEXT:    vpand %ymm2, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpshufb %ymm4, %ymm3, %ymm4
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-NEXT:    vporq %zmm0, %zmm4, %zmm0
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512VL-NEXT:    vporq %zmm0, %zmm1, %zmm0
 ; AVX512VL-NEXT:    retq
   %a.ext = zext <64 x i8> %a to <64 x i16>
   %b.ext = zext <64 x i8> %b to <64 x i16>
@@ -633,292 +657,308 @@ define <64 x i8> @clmulr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 define <32 x i16> @clmulr_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
 ; AVX512F-LABEL: clmulr_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
-; AVX512F-NEXT:    vpshufb %ymm3, %ymm2, %ymm5
-; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT:    vpand %ymm2, %ymm5, %ymm6
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpshufb %ymm6, %ymm4, %ymm6
-; AVX512F-NEXT:    vpshufb %ymm3, %ymm1, %ymm7
-; AVX512F-NEXT:    vpand %ymm2, %ymm7, %ymm1
-; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm6
-; AVX512F-NEXT:    vpsrlw $4, %ymm5, %ymm1
-; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm5
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
-; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpshufb %ymm5, %ymm1, %ymm5
-; AVX512F-NEXT:    vpsrlw $4, %ymm7, %ymm7
-; AVX512F-NEXT:    vpand %ymm2, %ymm7, %ymm7
-; AVX512F-NEXT:    vpshufb %ymm7, %ymm1, %ymm7
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512F-NEXT:    vporq %zmm5, %zmm6, %zmm5
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm6
-; AVX512F-NEXT:    vpshufb %ymm3, %ymm6, %ymm6
-; AVX512F-NEXT:    vpand %ymm2, %ymm6, %ymm9
-; AVX512F-NEXT:    vpshufb %ymm9, %ymm4, %ymm9
-; AVX512F-NEXT:    vpsrlw $4, %ymm6, %ymm6
-; AVX512F-NEXT:    vpand %ymm2, %ymm6, %ymm6
-; AVX512F-NEXT:    vpshufb %ymm6, %ymm1, %ymm6
-; AVX512F-NEXT:    vpor %ymm6, %ymm9, %ymm6
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm8, %ymm8
-; AVX512F-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm9
-; AVX512F-NEXT:    vpshufb %ymm9, %ymm4, %ymm9
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX512F-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm5, %ymm0, %ymm1
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vpor %ymm0, %ymm9, %ymm0
-; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm8, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
-; AVX512F-NEXT:    vpxorq %zmm7, %zmm8, %zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm8, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm9
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
+; AVX512F-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpshufb %ymm0, %ymm12, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm4, %ymm1
+; AVX512F-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm7
+; AVX512F-NEXT:    vpand %ymm5, %ymm7, %ymm7
+; AVX512F-NEXT:    vpshufb %ymm7, %ymm12, %ymm7
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm14 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512F-NEXT:    vpand %ymm7, %ymm14, %ymm9
 ; AVX512F-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ zmm7 ^ zmm8
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm8, %ymm8
-; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm8, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ zmm9 ^ zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm10
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm10, %ymm10
+; AVX512F-NEXT:    vpxor %ymm9, %ymm10, %ymm9
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm10
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm10, %ymm10
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm11
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm11, %ymm11
+; AVX512F-NEXT:    vpxor %ymm11, %ymm10, %ymm10
+; AVX512F-NEXT:    vpxor %ymm10, %ymm9, %ymm9
+; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm10
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm10, %ymm10
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm11
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm11, %ymm11
+; AVX512F-NEXT:    vpxor %ymm11, %ymm10, %ymm10
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm11
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm11, %ymm11
+; AVX512F-NEXT:    vpxor %ymm11, %ymm10, %ymm10
+; AVX512F-NEXT:    vpxor %ymm10, %ymm9, %ymm9
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm10
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm10, %ymm10
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm11
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm11, %ymm11
+; AVX512F-NEXT:    vpxor %ymm11, %ymm10, %ymm10
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm11
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm11, %ymm11
+; AVX512F-NEXT:    vpxor %ymm11, %ymm10, %ymm10
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm11
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm11, %ymm11
+; AVX512F-NEXT:    vpxor %ymm11, %ymm10, %ymm10
+; AVX512F-NEXT:    vpxor %ymm10, %ymm9, %ymm2
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm13 = [2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048]
+; AVX512F-NEXT:    vpand %ymm7, %ymm13, %ymm7
 ; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm9
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm11 = [4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096]
+; AVX512F-NEXT:    vpand %ymm1, %ymm11, %ymm9
 ; AVX512F-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ zmm8 ^ zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm8, %ymm8
-; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm8, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ zmm9 ^ zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm9
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
+; AVX512F-NEXT:    vpxor %ymm7, %ymm9, %ymm7
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm10 = [8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192]
+; AVX512F-NEXT:    vpand %ymm1, %ymm10, %ymm9
 ; AVX512F-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ zmm8 ^ zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm8, %ymm8
-; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm8, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ zmm9 ^ zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm5, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm9, %ymm6
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ zmm8 ^ zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
-; AVX512F-NEXT:    vpshufb %ymm3, %ymm5, %ymm5
-; AVX512F-NEXT:    vpand %ymm2, %ymm5, %ymm6
-; AVX512F-NEXT:    vpshufb %ymm6, %ymm4, %ymm6
-; AVX512F-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm6, %zmm3, %zmm3
-; AVX512F-NEXT:    vpsrlw $4, %ymm5, %ymm4
-; AVX512F-NEXT:    vpand %ymm2, %ymm4, %ymm4
-; AVX512F-NEXT:    vpshufb %ymm4, %ymm1, %ymm4
+; AVX512F-NEXT:    vpxor %ymm7, %ymm9, %ymm7
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm9 = [16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384]
+; AVX512F-NEXT:    vpand %ymm1, %ymm9, %ymm15
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm15, %ymm15
+; AVX512F-NEXT:    vpxor %ymm7, %ymm15, %ymm15
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm7 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX512F-NEXT:    vpand %ymm7, %ymm1, %ymm1
+; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %ymm0, %ymm15, %ymm0
+; AVX512F-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpshufb %ymm8, %ymm3, %ymm1
+; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm2
+; AVX512F-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT:    vpshufb %ymm1, %ymm12, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
+; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm3
+; AVX512F-NEXT:    vpand %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT:    vpshufb %ymm3, %ymm12, %ymm3
+; AVX512F-NEXT:    vpand %ymm3, %ymm14, %ymm4
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm1, %ymm4
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm15
+; AVX512F-NEXT:    vpmullw %ymm1, %ymm15, %ymm15
+; AVX512F-NEXT:    vpxor %ymm4, %ymm15, %ymm4
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm15
+; AVX512F-NEXT:    vpmullw %ymm1, %ymm15, %ymm15
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm14
+; AVX512F-NEXT:    vpmullw %ymm1, %ymm14, %ymm14
+; AVX512F-NEXT:    vpxor %ymm14, %ymm15, %ymm14
+; AVX512F-NEXT:    vpxor %ymm4, %ymm14, %ymm14
+; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpshufb %ymm2, %ymm6, %ymm4
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm2
+; AVX512F-NEXT:    vpmullw %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm15
+; AVX512F-NEXT:    vpmullw %ymm1, %ymm15, %ymm15
+; AVX512F-NEXT:    vpxor %ymm2, %ymm15, %ymm2
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm15
+; AVX512F-NEXT:    vpmullw %ymm1, %ymm15, %ymm15
+; AVX512F-NEXT:    vpxor %ymm2, %ymm15, %ymm2
+; AVX512F-NEXT:    vpxor %ymm2, %ymm14, %ymm2
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm14
+; AVX512F-NEXT:    vpmullw %ymm1, %ymm14, %ymm14
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm15
+; AVX512F-NEXT:    vpmullw %ymm1, %ymm15, %ymm15
+; AVX512F-NEXT:    vpxor %ymm15, %ymm14, %ymm14
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm15
+; AVX512F-NEXT:    vpmullw %ymm1, %ymm15, %ymm15
+; AVX512F-NEXT:    vpxor %ymm15, %ymm14, %ymm14
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm15
+; AVX512F-NEXT:    vpmullw %ymm1, %ymm15, %ymm15
+; AVX512F-NEXT:    vpxor %ymm15, %ymm14, %ymm14
+; AVX512F-NEXT:    vpxor %ymm2, %ymm14, %ymm2
+; AVX512F-NEXT:    vpand %ymm3, %ymm13, %ymm3
+; AVX512F-NEXT:    vpmullw %ymm3, %ymm1, %ymm3
+; AVX512F-NEXT:    vpand %ymm4, %ymm11, %ymm11
+; AVX512F-NEXT:    vpmullw %ymm1, %ymm11, %ymm11
+; AVX512F-NEXT:    vpxor %ymm3, %ymm11, %ymm3
+; AVX512F-NEXT:    vpand %ymm4, %ymm10, %ymm10
+; AVX512F-NEXT:    vpmullw %ymm1, %ymm10, %ymm10
+; AVX512F-NEXT:    vpxor %ymm3, %ymm10, %ymm3
+; AVX512F-NEXT:    vpand %ymm4, %ymm9, %ymm9
+; AVX512F-NEXT:    vpmullw %ymm1, %ymm9, %ymm9
+; AVX512F-NEXT:    vpxor %ymm3, %ymm9, %ymm3
+; AVX512F-NEXT:    vpand %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT:    vpand %ymm5, %ymm0, %ymm2
+; AVX512F-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm3
+; AVX512F-NEXT:    vpshufb %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-NEXT:    vporq %zmm0, %zmm3, %zmm0
+; AVX512F-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT:    vpshufb %ymm0, %ymm12, %ymm0
+; AVX512F-NEXT:    vpshufb %ymm1, %ymm12, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: clmulr_v32i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
-; AVX512VL-NEXT:    vpshufb %ymm3, %ymm2, %ymm5
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm2, %ymm5, %ymm6
-; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
 ; AVX512VL-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512VL-NEXT:    vpshufb %ymm6, %ymm4, %ymm6
-; AVX512VL-NEXT:    vpshufb %ymm3, %ymm1, %ymm7
-; AVX512VL-NEXT:    vpand %ymm2, %ymm7, %ymm1
-; AVX512VL-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm6
-; AVX512VL-NEXT:    vpsrlw $4, %ymm5, %ymm1
-; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm5
-; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
-; AVX512VL-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512VL-NEXT:    vpshufb %ymm5, %ymm1, %ymm5
-; AVX512VL-NEXT:    vpsrlw $4, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpshufb %ymm4, %ymm2, %ymm3
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm2, %ymm3, %ymm6
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512VL-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512VL-NEXT:    vpshufb %ymm6, %ymm5, %ymm6
+; AVX512VL-NEXT:    vpsrlw $4, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpand %ymm2, %ymm3, %ymm7
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512VL-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512VL-NEXT:    vpshufb %ymm7, %ymm3, %ymm7
+; AVX512VL-NEXT:    vpor %ymm7, %ymm6, %ymm6
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm7
+; AVX512VL-NEXT:    vpshufb %ymm4, %ymm7, %ymm8
+; AVX512VL-NEXT:    vpsrlw $4, %ymm8, %ymm7
 ; AVX512VL-NEXT:    vpand %ymm2, %ymm7, %ymm7
-; AVX512VL-NEXT:    vpshufb %ymm7, %ymm1, %ymm7
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512VL-NEXT:    vporq %zmm5, %zmm6, %zmm5
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm6
-; AVX512VL-NEXT:    vpshufb %ymm3, %ymm6, %ymm6
-; AVX512VL-NEXT:    vpand %ymm2, %ymm6, %ymm9
-; AVX512VL-NEXT:    vpshufb %ymm9, %ymm4, %ymm9
-; AVX512VL-NEXT:    vpsrlw $4, %ymm6, %ymm6
-; AVX512VL-NEXT:    vpand %ymm2, %ymm6, %ymm6
-; AVX512VL-NEXT:    vpshufb %ymm6, %ymm1, %ymm6
-; AVX512VL-NEXT:    vpor %ymm6, %ymm9, %ymm6
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm8, %ymm8
-; AVX512VL-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm9
-; AVX512VL-NEXT:    vpshufb %ymm9, %ymm4, %ymm9
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT:    vpor %ymm0, %ymm9, %ymm0
-; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm8, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
-; AVX512VL-NEXT:    vpxorq %zmm7, %zmm8, %zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm8, %ymm9
+; AVX512VL-NEXT:    vpshufb %ymm7, %ymm3, %ymm7
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm18 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512VL-NEXT:    vpandq %ymm18, %ymm7, %ymm9
 ; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm9
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ zmm7 ^ zmm8
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm8, %ymm8
-; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm8, %ymm10
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm22 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT:    vpandq %ymm22, %ymm7, %ymm10
 ; AVX512VL-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ zmm9 ^ zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
+; AVX512VL-NEXT:    vpxor %ymm9, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm24 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VL-NEXT:    vpandq %ymm24, %ymm7, %ymm9
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm11
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm25 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpandq %ymm25, %ymm7, %ymm12
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm12, %ymm12
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ ymm10 ^ ymm11
+; AVX512VL-NEXT:    vpand %ymm2, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpshufb %ymm8, %ymm5, %ymm8
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm26 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpandq %ymm26, %ymm8, %ymm11
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm11, %ymm13
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm27 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512VL-NEXT:    vpandq %ymm27, %ymm8, %ymm14
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm14, %ymm14
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 ^ ymm12 ^ ymm13
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512VL-NEXT:    vpand %ymm12, %ymm8, %ymm13
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm13, %ymm15
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512VL-NEXT:    vpand %ymm13, %ymm8, %ymm9
 ; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm9
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ zmm8 ^ zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm8, %ymm8
-; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm8, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ zmm9 ^ zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ ymm14 ^ ymm15
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX512VL-NEXT:    vpand %ymm7, %ymm14, %ymm15
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm15, %ymm10
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [512,512,512,512,512,512,512,512,512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT:    vpand %ymm7, %ymm15, %ymm11
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm11, %ymm11
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 ^ ymm9 ^ ymm10
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm16 = [1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024]
+; AVX512VL-NEXT:    vpandq %ymm16, %ymm7, %ymm9
 ; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm9
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm17 = [2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048]
+; AVX512VL-NEXT:    vpandq %ymm17, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpmullw %ymm7, %ymm6, %ymm7
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ ymm11 ^ ymm9
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm19 = [4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096]
+; AVX512VL-NEXT:    vpandq %ymm19, %ymm8, %ymm9
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm20 = [8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192]
+; AVX512VL-NEXT:    vpandq %ymm20, %ymm8, %ymm10
 ; AVX512VL-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 ^ ymm7 ^ ymm9
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm21 = [16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384]
+; AVX512VL-NEXT:    vpandq %ymm21, %ymm8, %ymm7
+; AVX512VL-NEXT:    vpmullw %ymm7, %ymm6, %ymm7
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm23 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX512VL-NEXT:    vpandq %ymm23, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm8, %ymm6
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 ^ ymm10 ^ ymm7
+; AVX512VL-NEXT:    vpshufb %ymm4, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpand %ymm2, %ymm6, %ymm7
+; AVX512VL-NEXT:    vpshufb %ymm7, %ymm5, %ymm7
+; AVX512VL-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm8
+; AVX512VL-NEXT:    vpshufb %ymm8, %ymm5, %ymm8
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm8, %ymm0
+; AVX512VL-NEXT:    vpshufb %ymm4, %ymm1, %ymm8
+; AVX512VL-NEXT:    vpsrlw $4, %ymm8, %ymm1
+; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpandq %ymm18, %ymm1, %ymm9
 ; AVX512VL-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ zmm8 ^ zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm8, %ymm8
-; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm8, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ zmm9 ^ zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm5, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm6
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ zmm8 ^ zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
-; AVX512VL-NEXT:    vpshufb %ymm3, %ymm5, %ymm5
-; AVX512VL-NEXT:    vpand %ymm2, %ymm5, %ymm6
-; AVX512VL-NEXT:    vpshufb %ymm6, %ymm4, %ymm6
-; AVX512VL-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm6, %zmm3, %zmm3
-; AVX512VL-NEXT:    vpsrlw $4, %ymm5, %ymm4
+; AVX512VL-NEXT:    vpandq %ymm22, %ymm1, %ymm10
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpxor %ymm9, %ymm10, %ymm9
+; AVX512VL-NEXT:    vpandq %ymm24, %ymm1, %ymm10
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpandq %ymm25, %ymm1, %ymm11
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm11, %ymm11
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 ^ ymm9 ^ ymm10
+; AVX512VL-NEXT:    vpand %ymm2, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpshufb %ymm8, %ymm5, %ymm8
+; AVX512VL-NEXT:    vpandq %ymm26, %ymm8, %ymm9
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpandq %ymm27, %ymm8, %ymm10
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 ^ ymm11 ^ ymm9
+; AVX512VL-NEXT:    vpand %ymm12, %ymm8, %ymm9
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpand %ymm13, %ymm8, %ymm11
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm11, %ymm11
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 ^ ymm10 ^ ymm9
+; AVX512VL-NEXT:    vpand %ymm1, %ymm14, %ymm9
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpand %ymm1, %ymm15, %ymm10
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 ^ ymm11 ^ ymm9
+; AVX512VL-NEXT:    vpandq %ymm16, %ymm1, %ymm9
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpandq %ymm17, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmullw %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ ymm10 ^ ymm9
+; AVX512VL-NEXT:    vpandq %ymm19, %ymm8, %ymm9
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpandq %ymm20, %ymm8, %ymm10
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 ^ ymm1 ^ ymm9
+; AVX512VL-NEXT:    vpandq %ymm21, %ymm8, %ymm1
+; AVX512VL-NEXT:    vpmullw %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpandq %ymm23, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm8, %ymm0
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ ymm10 ^ ymm1
+; AVX512VL-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm7, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpsrlw $4, %ymm6, %ymm4
 ; AVX512VL-NEXT:    vpand %ymm2, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpshufb %ymm4, %ymm1, %ymm4
+; AVX512VL-NEXT:    vpshufb %ymm4, %ymm3, %ymm4
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512VL-NEXT:    vporq %zmm0, %zmm3, %zmm0
+; AVX512VL-NEXT:    vporq %zmm0, %zmm1, %zmm0
 ; AVX512VL-NEXT:    retq
   %a.ext = zext <32 x i16> %a to <32 x i32>
   %b.ext = zext <32 x i16> %b to <32 x i32>
@@ -1127,163 +1167,183 @@ define <8 x i64> @clmulr_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
 define <64 x i8> @clmulh_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512F-LABEL: clmulh_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT:    vpand %ymm2, %ymm4, %ymm5
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpshufb %ymm5, %ymm3, %ymm5
-; AVX512F-NEXT:    vpsrlw $4, %ymm4, %ymm4
+; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm4
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpshufb %ymm4, %ymm0, %ymm5
+; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm4
 ; AVX512F-NEXT:    vpand %ymm2, %ymm4, %ymm6
 ; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
 ; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm6, %ymm4, %ymm6
-; AVX512F-NEXT:    vpor %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm7
-; AVX512F-NEXT:    vpsrlw $4, %ymm7, %ymm6
-; AVX512F-NEXT:    vpand %ymm2, %ymm6, %ymm6
-; AVX512F-NEXT:    vpshufb %ymm6, %ymm4, %ymm6
-; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm8
-; AVX512F-NEXT:    vpand %ymm2, %ymm8, %ymm8
-; AVX512F-NEXT:    vpshufb %ymm8, %ymm4, %ymm8
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm6, %zmm8, %zmm8
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm8, %zmm9
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm10, %ymm6
-; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm11
-; AVX512F-NEXT:    vpshufb %ymm11, %ymm3, %ymm11
-; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT:    vpor %ymm0, %ymm11, %ymm0
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm9, %ymm11
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm6, %zmm11, %zmm11
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX512F-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512F-NEXT:    vpmaddubsw %ymm9, %ymm0, %ymm9
+; AVX512F-NEXT:    vpor %ymm6, %ymm5, %ymm14
+; AVX512F-NEXT:    vpsrlw $4, %ymm3, %ymm5
+; AVX512F-NEXT:    vpand %ymm2, %ymm5, %ymm5
+; AVX512F-NEXT:    vpshufb %ymm5, %ymm4, %ymm10
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm7
+; AVX512F-NEXT:    vpmullw %ymm7, %ymm14, %ymm8
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm8
+; AVX512F-NEXT:    vpandn %ymm7, %ymm5, %ymm7
+; AVX512F-NEXT:    vpmaddubsw %ymm7, %ymm14, %ymm7
+; AVX512F-NEXT:    vpsllw $8, %ymm7, %ymm7
+; AVX512F-NEXT:    vpor %ymm7, %ymm8, %ymm8
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm9
+; AVX512F-NEXT:    vpmullw %ymm9, %ymm14, %ymm11
+; AVX512F-NEXT:    vpand %ymm5, %ymm11, %ymm11
+; AVX512F-NEXT:    vpandn %ymm9, %ymm5, %ymm9
+; AVX512F-NEXT:    vpmaddubsw %ymm9, %ymm14, %ymm9
 ; AVX512F-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512F-NEXT:    vpandn %ymm10, %ymm6, %ymm10
-; AVX512F-NEXT:    vpmaddubsw %ymm10, %ymm5, %ymm10
-; AVX512F-NEXT:    vpsllw $8, %ymm10, %ymm10
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm11 & zmm6)
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm8, %zmm10
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm10, %ymm11
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm11, %ymm12
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm10, %ymm13
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
-; AVX512F-NEXT:    vpandq %zmm6, %zmm12, %zmm12
-; AVX512F-NEXT:    vpandn %ymm10, %ymm6, %ymm10
-; AVX512F-NEXT:    vpmaddubsw %ymm10, %ymm0, %ymm10
-; AVX512F-NEXT:    vpsllw $8, %ymm10, %ymm10
-; AVX512F-NEXT:    vpandn %ymm11, %ymm6, %ymm11
-; AVX512F-NEXT:    vpmaddubsw %ymm11, %ymm5, %ymm11
-; AVX512F-NEXT:    vpsllw $8, %ymm11, %ymm11
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm11, %zmm10, %zmm10
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm9 ^ (zmm10 | zmm12)
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm8, %zmm9
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm9, %ymm11
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm11, %ymm12
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm9, %ymm13
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
-; AVX512F-NEXT:    vpandq %zmm6, %zmm12, %zmm12
-; AVX512F-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512F-NEXT:    vpmaddubsw %ymm9, %ymm0, %ymm9
+; AVX512F-NEXT:    vpor %ymm9, %ymm11, %ymm9
+; AVX512F-NEXT:    vpxor %ymm8, %ymm9, %ymm11
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm8 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512F-NEXT:    vpand %ymm8, %ymm10, %ymm9
+; AVX512F-NEXT:    vpmullw %ymm9, %ymm14, %ymm12
+; AVX512F-NEXT:    vpand %ymm5, %ymm12, %ymm12
+; AVX512F-NEXT:    vpandn %ymm9, %ymm5, %ymm9
+; AVX512F-NEXT:    vpmaddubsw %ymm9, %ymm14, %ymm9
 ; AVX512F-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512F-NEXT:    vpandn %ymm11, %ymm6, %ymm11
-; AVX512F-NEXT:    vpmaddubsw %ymm11, %ymm5, %ymm11
-; AVX512F-NEXT:    vpsllw $8, %ymm11, %ymm11
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm11, %zmm9, %zmm9
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm10 ^ (zmm9 | zmm12)
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm8, %zmm8
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm8, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm10, %ymm11
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm8, %ymm12
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm11, %zmm12, %zmm11
-; AVX512F-NEXT:    vpandq %zmm6, %zmm11, %zmm11
-; AVX512F-NEXT:    vpandn %ymm8, %ymm6, %ymm8
-; AVX512F-NEXT:    vpmaddubsw %ymm8, %ymm0, %ymm8
-; AVX512F-NEXT:    vpsllw $8, %ymm8, %ymm8
-; AVX512F-NEXT:    vpandn %ymm10, %ymm6, %ymm10
-; AVX512F-NEXT:    vpmaddubsw %ymm10, %ymm5, %ymm10
+; AVX512F-NEXT:    vpor %ymm9, %ymm12, %ymm12
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10
+; AVX512F-NEXT:    vpmullw %ymm10, %ymm14, %ymm13
+; AVX512F-NEXT:    vpand %ymm5, %ymm13, %ymm13
+; AVX512F-NEXT:    vpandn %ymm10, %ymm5, %ymm10
+; AVX512F-NEXT:    vpmaddubsw %ymm10, %ymm14, %ymm10
 ; AVX512F-NEXT:    vpsllw $8, %ymm10, %ymm10
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm9 ^ (zmm8 | zmm11)
-; AVX512F-NEXT:    vpand %ymm2, %ymm7, %ymm7
-; AVX512F-NEXT:    vpshufb %ymm7, %ymm3, %ymm7
+; AVX512F-NEXT:    vpor %ymm10, %ymm13, %ymm10
+; AVX512F-NEXT:    vpxor %ymm10, %ymm12, %ymm10
+; AVX512F-NEXT:    vpxor %ymm10, %ymm11, %ymm13
+; AVX512F-NEXT:    vpand %ymm2, %ymm3, %ymm10
+; AVX512F-NEXT:    vmovdqa %ymm0, %ymm9
+; AVX512F-NEXT:    vpshufb %ymm10, %ymm0, %ymm15
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm10 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpand %ymm10, %ymm15, %ymm11
+; AVX512F-NEXT:    vpmullw %ymm11, %ymm14, %ymm12
+; AVX512F-NEXT:    vpand %ymm5, %ymm12, %ymm12
+; AVX512F-NEXT:    vpandn %ymm11, %ymm5, %ymm11
+; AVX512F-NEXT:    vpmaddubsw %ymm11, %ymm14, %ymm11
+; AVX512F-NEXT:    vpsllw $8, %ymm11, %ymm11
+; AVX512F-NEXT:    vpor %ymm11, %ymm12, %ymm12
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT:    vpand %ymm11, %ymm15, %ymm0
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm14, %ymm6
+; AVX512F-NEXT:    vpand %ymm5, %ymm6, %ymm6
+; AVX512F-NEXT:    vpandn %ymm0, %ymm5, %ymm0
+; AVX512F-NEXT:    vpmaddubsw %ymm0, %ymm14, %ymm0
+; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT:    vpxor %ymm0, %ymm12, %ymm0
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm12 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512F-NEXT:    vpand %ymm12, %ymm15, %ymm6
+; AVX512F-NEXT:    vpmullw %ymm6, %ymm14, %ymm7
+; AVX512F-NEXT:    vpand %ymm5, %ymm7, %ymm7
+; AVX512F-NEXT:    vpandn %ymm6, %ymm5, %ymm6
+; AVX512F-NEXT:    vpmaddubsw %ymm6, %ymm14, %ymm6
+; AVX512F-NEXT:    vpsllw $8, %ymm6, %ymm6
+; AVX512F-NEXT:    vpor %ymm6, %ymm7, %ymm6
+; AVX512F-NEXT:    vpxor %ymm6, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %ymm0, %ymm13, %ymm0
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512F-NEXT:    vpand %ymm13, %ymm15, %ymm6
+; AVX512F-NEXT:    vpmullw %ymm6, %ymm14, %ymm7
+; AVX512F-NEXT:    vpandn %ymm6, %ymm5, %ymm6
+; AVX512F-NEXT:    vpmaddubsw %ymm6, %ymm14, %ymm6
+; AVX512F-NEXT:    vpand %ymm5, %ymm7, %ymm7
+; AVX512F-NEXT:    vpsllw $8, %ymm6, %ymm6
+; AVX512F-NEXT:    vpor %ymm6, %ymm7, %ymm6
+; AVX512F-NEXT:    vpxor %ymm6, %ymm0, %ymm0
+; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm6
+; AVX512F-NEXT:    vpshufb %ymm6, %ymm9, %ymm6
+; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm6, %ymm14
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
+; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm1
+; AVX512F-NEXT:    vpshufb %ymm1, %ymm9, %ymm1
+; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm1
+; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm3
+; AVX512F-NEXT:    vpand %ymm2, %ymm3, %ymm3
+; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm6
+; AVX512F-NEXT:    vpmullw %ymm6, %ymm0, %ymm7
+; AVX512F-NEXT:    vpand %ymm5, %ymm7, %ymm7
+; AVX512F-NEXT:    vpandn %ymm6, %ymm5, %ymm6
+; AVX512F-NEXT:    vpmaddubsw %ymm6, %ymm0, %ymm6
+; AVX512F-NEXT:    vpsllw $8, %ymm6, %ymm6
+; AVX512F-NEXT:    vpor %ymm6, %ymm7, %ymm6
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm7
+; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm15
+; AVX512F-NEXT:    vpand %ymm5, %ymm15, %ymm15
+; AVX512F-NEXT:    vpandn %ymm7, %ymm5, %ymm7
+; AVX512F-NEXT:    vpmaddubsw %ymm7, %ymm0, %ymm7
+; AVX512F-NEXT:    vpsllw $8, %ymm7, %ymm7
+; AVX512F-NEXT:    vpor %ymm7, %ymm15, %ymm7
+; AVX512F-NEXT:    vpxor %ymm6, %ymm7, %ymm6
+; AVX512F-NEXT:    vpand %ymm3, %ymm8, %ymm7
+; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm8
+; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm8
+; AVX512F-NEXT:    vpandn %ymm7, %ymm5, %ymm7
+; AVX512F-NEXT:    vpmaddubsw %ymm7, %ymm0, %ymm7
+; AVX512F-NEXT:    vpsllw $8, %ymm7, %ymm7
+; AVX512F-NEXT:    vpor %ymm7, %ymm8, %ymm7
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpmullw %ymm3, %ymm0, %ymm8
+; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm8
+; AVX512F-NEXT:    vpandn %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT:    vpmaddubsw %ymm3, %ymm0, %ymm3
+; AVX512F-NEXT:    vpsllw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpor %ymm3, %ymm8, %ymm3
+; AVX512F-NEXT:    vpxor %ymm3, %ymm7, %ymm3
+; AVX512F-NEXT:    vpxor %ymm3, %ymm6, %ymm3
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm7, %zmm1, %zmm1
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm9, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm11
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512F-NEXT:    vpandq %zmm6, %zmm10, %zmm10
-; AVX512F-NEXT:    vpandn %ymm7, %ymm6, %ymm7
+; AVX512F-NEXT:    vpshufb %ymm1, %ymm9, %ymm1
+; AVX512F-NEXT:    vpand %ymm1, %ymm10, %ymm6
+; AVX512F-NEXT:    vpmullw %ymm6, %ymm0, %ymm7
+; AVX512F-NEXT:    vpand %ymm5, %ymm7, %ymm7
+; AVX512F-NEXT:    vpandn %ymm6, %ymm5, %ymm6
+; AVX512F-NEXT:    vpmaddubsw %ymm6, %ymm0, %ymm6
+; AVX512F-NEXT:    vpsllw $8, %ymm6, %ymm6
+; AVX512F-NEXT:    vpor %ymm6, %ymm7, %ymm6
+; AVX512F-NEXT:    vpand %ymm1, %ymm11, %ymm7
+; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm8
+; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm8
+; AVX512F-NEXT:    vpandn %ymm7, %ymm5, %ymm7
 ; AVX512F-NEXT:    vpmaddubsw %ymm7, %ymm0, %ymm7
 ; AVX512F-NEXT:    vpsllw $8, %ymm7, %ymm7
-; AVX512F-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512F-NEXT:    vpmaddubsw %ymm9, %ymm5, %ymm9
-; AVX512F-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm8 ^ (zmm7 | zmm10)
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm8
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm8, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm9, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm8, %ymm11
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512F-NEXT:    vpandq %zmm6, %zmm10, %zmm10
-; AVX512F-NEXT:    vpandn %ymm8, %ymm6, %ymm8
-; AVX512F-NEXT:    vpmaddubsw %ymm8, %ymm0, %ymm8
-; AVX512F-NEXT:    vpsllw $8, %ymm8, %ymm8
-; AVX512F-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512F-NEXT:    vpmaddubsw %ymm9, %ymm5, %ymm9
-; AVX512F-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm7 ^ (zmm8 | zmm10)
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm9, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm11
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512F-NEXT:    vpandq %zmm6, %zmm10, %zmm10
-; AVX512F-NEXT:    vpandn %ymm7, %ymm6, %ymm7
+; AVX512F-NEXT:    vpor %ymm7, %ymm8, %ymm7
+; AVX512F-NEXT:    vpxor %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT:    vpand %ymm1, %ymm12, %ymm7
+; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm8
+; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm8
+; AVX512F-NEXT:    vpandn %ymm7, %ymm5, %ymm7
 ; AVX512F-NEXT:    vpmaddubsw %ymm7, %ymm0, %ymm7
 ; AVX512F-NEXT:    vpsllw $8, %ymm7, %ymm7
-; AVX512F-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512F-NEXT:    vpmaddubsw %ymm9, %ymm5, %ymm9
-; AVX512F-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm8 ^ (zmm7 | zmm10)
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm8
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm8, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm10
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm9, %zmm10, %zmm9
-; AVX512F-NEXT:    vpandq %zmm6, %zmm9, %zmm9
-; AVX512F-NEXT:    vpandn %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT:    vpor %ymm7, %ymm8, %ymm7
+; AVX512F-NEXT:    vpxor %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT:    vpxor %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT:    vpand %ymm1, %ymm13, %ymm1
+; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm6
+; AVX512F-NEXT:    vpand %ymm5, %ymm6, %ymm6
+; AVX512F-NEXT:    vpandn %ymm1, %ymm5, %ymm1
 ; AVX512F-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vpandn %ymm8, %ymm6, %ymm1
-; AVX512F-NEXT:    vpmaddubsw %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT:    vpsllw $8, %ymm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm7 ^ (zmm0 | zmm9)
+; AVX512F-NEXT:    vpor %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT:    vpxor %ymm0, %ymm3, %ymm0
 ; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm1
-; AVX512F-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm5
-; AVX512F-NEXT:    vpand %ymm2, %ymm5, %ymm5
-; AVX512F-NEXT:    vpshufb %ymm5, %ymm4, %ymm5
-; AVX512F-NEXT:    vpor %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm5
-; AVX512F-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT:    vpshufb %ymm1, %ymm9, %ymm1
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpsrlw $1, %ymm14, %ymm1
 ; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
@@ -1291,165 +1351,169 @@ define <64 x i8> @clmulh_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; AVX512VL-LABEL: clmulh_v64i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
 ; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm2, %ymm4, %ymm5
+; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm4
 ; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
 ; AVX512VL-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512VL-NEXT:    vpshufb %ymm5, %ymm3, %ymm5
-; AVX512VL-NEXT:    vpsrlw $4, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpshufb %ymm4, %ymm3, %ymm5
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512VL-NEXT:    vpand %ymm2, %ymm4, %ymm6
 ; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
 ; AVX512VL-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512VL-NEXT:    vpshufb %ymm6, %ymm4, %ymm6
-; AVX512VL-NEXT:    vpor %ymm6, %ymm5, %ymm5
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm7
-; AVX512VL-NEXT:    vpsrlw $4, %ymm7, %ymm6
-; AVX512VL-NEXT:    vpand %ymm2, %ymm6, %ymm6
-; AVX512VL-NEXT:    vpshufb %ymm6, %ymm4, %ymm6
-; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm8
-; AVX512VL-NEXT:    vpand %ymm2, %ymm8, %ymm8
-; AVX512VL-NEXT:    vpshufb %ymm8, %ymm4, %ymm8
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm6, %zmm8, %zmm8
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm8, %zmm9
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm10, %ymm6
-; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm11
-; AVX512VL-NEXT:    vpshufb %ymm11, %ymm3, %ymm11
+; AVX512VL-NEXT:    vpor %ymm6, %ymm5, %ymm14
+; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpand %ymm2, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpshufb %ymm5, %ymm4, %ymm10
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm16 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT:    vpandq %ymm16, %ymm10, %ymm7
+; AVX512VL-NEXT:    vpmullw %ymm7, %ymm14, %ymm8
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpandn %ymm7, %ymm5, %ymm7
+; AVX512VL-NEXT:    vpmaddubsw %ymm7, %ymm14, %ymm7
+; AVX512VL-NEXT:    vpsllw $8, %ymm7, %ymm9
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm8 & ymm5)
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm17 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512VL-NEXT:    vpandq %ymm17, %ymm10, %ymm8
+; AVX512VL-NEXT:    vpmullw %ymm8, %ymm14, %ymm11
+; AVX512VL-NEXT:    vpand %ymm5, %ymm11, %ymm11
+; AVX512VL-NEXT:    vpandn %ymm8, %ymm5, %ymm8
+; AVX512VL-NEXT:    vpmaddubsw %ymm8, %ymm14, %ymm8
+; AVX512VL-NEXT:    vpsllw $8, %ymm8, %ymm12
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm9 ^ (ymm12 | ymm11)
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm18 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VL-NEXT:    vpandq %ymm18, %ymm10, %ymm9
+; AVX512VL-NEXT:    vpmullw %ymm9, %ymm14, %ymm11
+; AVX512VL-NEXT:    vpand %ymm5, %ymm11, %ymm11
+; AVX512VL-NEXT:    vpandn %ymm9, %ymm5, %ymm9
+; AVX512VL-NEXT:    vpmaddubsw %ymm9, %ymm14, %ymm9
+; AVX512VL-NEXT:    vpsllw $8, %ymm9, %ymm13
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm12 ^ (ymm13 | ymm11)
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpand %ymm9, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpmullw %ymm10, %ymm14, %ymm11
+; AVX512VL-NEXT:    vpand %ymm5, %ymm11, %ymm11
+; AVX512VL-NEXT:    vpandn %ymm10, %ymm5, %ymm10
+; AVX512VL-NEXT:    vpmaddubsw %ymm10, %ymm14, %ymm10
+; AVX512VL-NEXT:    vpsllw $8, %ymm10, %ymm12
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm13 ^ (ymm12 | ymm11)
+; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm10
+; AVX512VL-NEXT:    vpshufb %ymm10, %ymm3, %ymm15
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpand %ymm10, %ymm15, %ymm11
+; AVX512VL-NEXT:    vpmullw %ymm11, %ymm14, %ymm13
+; AVX512VL-NEXT:    vpand %ymm5, %ymm13, %ymm13
+; AVX512VL-NEXT:    vpandn %ymm11, %ymm5, %ymm11
+; AVX512VL-NEXT:    vpmaddubsw %ymm11, %ymm14, %ymm11
+; AVX512VL-NEXT:    vpsllw $8, %ymm11, %ymm6
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm12 ^ (ymm6 | ymm13)
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512VL-NEXT:    vpand %ymm11, %ymm15, %ymm12
+; AVX512VL-NEXT:    vpmullw %ymm12, %ymm14, %ymm13
+; AVX512VL-NEXT:    vpand %ymm5, %ymm13, %ymm13
+; AVX512VL-NEXT:    vpandn %ymm12, %ymm5, %ymm12
+; AVX512VL-NEXT:    vpmaddubsw %ymm12, %ymm14, %ymm12
+; AVX512VL-NEXT:    vpsllw $8, %ymm12, %ymm7
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm6 ^ (ymm7 | ymm13)
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512VL-NEXT:    vpand %ymm12, %ymm15, %ymm6
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm14, %ymm13
+; AVX512VL-NEXT:    vpand %ymm5, %ymm13, %ymm13
+; AVX512VL-NEXT:    vpandn %ymm6, %ymm5, %ymm6
+; AVX512VL-NEXT:    vpmaddubsw %ymm6, %ymm14, %ymm6
+; AVX512VL-NEXT:    vpsllw $8, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm7 ^ (ymm6 | ymm13)
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512VL-NEXT:    vpand %ymm13, %ymm15, %ymm7
+; AVX512VL-NEXT:    vpmullw %ymm7, %ymm14, %ymm15
+; AVX512VL-NEXT:    vpand %ymm5, %ymm15, %ymm15
+; AVX512VL-NEXT:    vpandn %ymm7, %ymm5, %ymm7
+; AVX512VL-NEXT:    vpmaddubsw %ymm7, %ymm14, %ymm7
+; AVX512VL-NEXT:    vpsllw $8, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm6 ^ (ymm7 | ymm15)
+; AVX512VL-NEXT:    vpand %ymm2, %ymm7, %ymm6
+; AVX512VL-NEXT:    vpshufb %ymm6, %ymm3, %ymm6
+; AVX512VL-NEXT:    vpsrlw $4, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpand %ymm2, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpshufb %ymm7, %ymm4, %ymm7
+; AVX512VL-NEXT:    vpor %ymm7, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpsrlw $1, %ymm6, %ymm14
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm6
+; AVX512VL-NEXT:    vpshufb %ymm6, %ymm3, %ymm6
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512VL-NEXT:    vpor %ymm0, %ymm11, %ymm0
-; AVX512VL-NEXT:    vpmullw %ymm0, %ymm9, %ymm11
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm6, %zmm11, %zmm11
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX512VL-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512VL-NEXT:    vpmaddubsw %ymm9, %ymm0, %ymm9
-; AVX512VL-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512VL-NEXT:    vpandn %ymm10, %ymm6, %ymm10
-; AVX512VL-NEXT:    vpmaddubsw %ymm10, %ymm5, %ymm10
-; AVX512VL-NEXT:    vpsllw $8, %ymm10, %ymm10
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm11 & zmm6)
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm8, %zmm10
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm10, %ymm11
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm11, %ymm12
-; AVX512VL-NEXT:    vpmullw %ymm0, %ymm10, %ymm13
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
-; AVX512VL-NEXT:    vpandq %zmm6, %zmm12, %zmm12
-; AVX512VL-NEXT:    vpandn %ymm10, %ymm6, %ymm10
-; AVX512VL-NEXT:    vpmaddubsw %ymm10, %ymm0, %ymm10
-; AVX512VL-NEXT:    vpsllw $8, %ymm10, %ymm10
-; AVX512VL-NEXT:    vpandn %ymm11, %ymm6, %ymm11
-; AVX512VL-NEXT:    vpmaddubsw %ymm11, %ymm5, %ymm11
-; AVX512VL-NEXT:    vpsllw $8, %ymm11, %ymm11
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm11, %zmm10, %zmm10
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm9 ^ (zmm10 | zmm12)
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm8, %zmm9
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm9, %ymm11
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm11, %ymm12
-; AVX512VL-NEXT:    vpmullw %ymm0, %ymm9, %ymm13
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
-; AVX512VL-NEXT:    vpandq %zmm6, %zmm12, %zmm12
-; AVX512VL-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512VL-NEXT:    vpmaddubsw %ymm9, %ymm0, %ymm9
-; AVX512VL-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512VL-NEXT:    vpandn %ymm11, %ymm6, %ymm11
-; AVX512VL-NEXT:    vpmaddubsw %ymm11, %ymm5, %ymm11
-; AVX512VL-NEXT:    vpsllw $8, %ymm11, %ymm11
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm11, %zmm9, %zmm9
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm10 ^ (zmm9 | zmm12)
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm8, %zmm8
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm8, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm10, %ymm11
-; AVX512VL-NEXT:    vpmullw %ymm0, %ymm8, %ymm12
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm11, %zmm12, %zmm11
-; AVX512VL-NEXT:    vpandq %zmm6, %zmm11, %zmm11
-; AVX512VL-NEXT:    vpandn %ymm8, %ymm6, %ymm8
-; AVX512VL-NEXT:    vpmaddubsw %ymm8, %ymm0, %ymm8
-; AVX512VL-NEXT:    vpsllw $8, %ymm8, %ymm8
-; AVX512VL-NEXT:    vpandn %ymm10, %ymm6, %ymm10
-; AVX512VL-NEXT:    vpmaddubsw %ymm10, %ymm5, %ymm10
-; AVX512VL-NEXT:    vpsllw $8, %ymm10, %ymm10
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm9 ^ (zmm8 | zmm11)
-; AVX512VL-NEXT:    vpand %ymm2, %ymm7, %ymm7
-; AVX512VL-NEXT:    vpshufb %ymm7, %ymm3, %ymm7
+; AVX512VL-NEXT:    vpor %ymm0, %ymm6, %ymm0
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm6
+; AVX512VL-NEXT:    vpand %ymm2, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpshufb %ymm6, %ymm4, %ymm6
+; AVX512VL-NEXT:    vpandq %ymm16, %ymm6, %ymm7
+; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm15
+; AVX512VL-NEXT:    vpandn %ymm7, %ymm5, %ymm7
+; AVX512VL-NEXT:    vpmaddubsw %ymm7, %ymm0, %ymm7
+; AVX512VL-NEXT:    vpsllw $8, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm15 & ymm5)
+; AVX512VL-NEXT:    vpandq %ymm17, %ymm6, %ymm15
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm15, %ymm8
+; AVX512VL-NEXT:    vpand %ymm5, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpandn %ymm15, %ymm5, %ymm15
+; AVX512VL-NEXT:    vpmaddubsw %ymm15, %ymm0, %ymm15
+; AVX512VL-NEXT:    vpsllw $8, %ymm15, %ymm15
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 | ymm8)
+; AVX512VL-NEXT:    vpandq %ymm18, %ymm6, %ymm7
+; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm8
+; AVX512VL-NEXT:    vpand %ymm5, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpandn %ymm7, %ymm5, %ymm7
+; AVX512VL-NEXT:    vpmaddubsw %ymm7, %ymm0, %ymm7
+; AVX512VL-NEXT:    vpsllw $8, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm15 ^ (ymm7 | ymm8)
+; AVX512VL-NEXT:    vpand %ymm6, %ymm9, %ymm6
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm0, %ymm8
+; AVX512VL-NEXT:    vpand %ymm5, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpandn %ymm6, %ymm5, %ymm6
+; AVX512VL-NEXT:    vpmaddubsw %ymm6, %ymm0, %ymm6
+; AVX512VL-NEXT:    vpsllw $8, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm7 ^ (ymm6 | ymm8)
 ; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm7, %zmm1, %zmm1
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm9, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm11
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512VL-NEXT:    vpandq %zmm6, %zmm10, %zmm10
-; AVX512VL-NEXT:    vpandn %ymm7, %ymm6, %ymm7
+; AVX512VL-NEXT:    vpand %ymm1, %ymm10, %ymm7
+; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm8
+; AVX512VL-NEXT:    vpand %ymm5, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpandn %ymm7, %ymm5, %ymm7
 ; AVX512VL-NEXT:    vpmaddubsw %ymm7, %ymm0, %ymm7
 ; AVX512VL-NEXT:    vpsllw $8, %ymm7, %ymm7
-; AVX512VL-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512VL-NEXT:    vpmaddubsw %ymm9, %ymm5, %ymm9
-; AVX512VL-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm8 ^ (zmm7 | zmm10)
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm8
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm8, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm9, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm0, %ymm8, %ymm11
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512VL-NEXT:    vpandq %zmm6, %zmm10, %zmm10
-; AVX512VL-NEXT:    vpandn %ymm8, %ymm6, %ymm8
-; AVX512VL-NEXT:    vpmaddubsw %ymm8, %ymm0, %ymm8
-; AVX512VL-NEXT:    vpsllw $8, %ymm8, %ymm8
-; AVX512VL-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512VL-NEXT:    vpmaddubsw %ymm9, %ymm5, %ymm9
-; AVX512VL-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm7 ^ (zmm8 | zmm10)
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm9, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm11
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512VL-NEXT:    vpandq %zmm6, %zmm10, %zmm10
-; AVX512VL-NEXT:    vpandn %ymm7, %ymm6, %ymm7
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm6 ^ (ymm7 | ymm8)
+; AVX512VL-NEXT:    vpand %ymm1, %ymm11, %ymm6
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm0, %ymm8
+; AVX512VL-NEXT:    vpand %ymm5, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpandn %ymm6, %ymm5, %ymm6
+; AVX512VL-NEXT:    vpmaddubsw %ymm6, %ymm0, %ymm6
+; AVX512VL-NEXT:    vpsllw $8, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm7 ^ (ymm6 | ymm8)
+; AVX512VL-NEXT:    vpand %ymm1, %ymm12, %ymm7
+; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm8
+; AVX512VL-NEXT:    vpand %ymm5, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpandn %ymm7, %ymm5, %ymm7
 ; AVX512VL-NEXT:    vpmaddubsw %ymm7, %ymm0, %ymm7
 ; AVX512VL-NEXT:    vpsllw $8, %ymm7, %ymm7
-; AVX512VL-NEXT:    vpandn %ymm9, %ymm6, %ymm9
-; AVX512VL-NEXT:    vpmaddubsw %ymm9, %ymm5, %ymm9
-; AVX512VL-NEXT:    vpsllw $8, %ymm9, %ymm9
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm8 ^ (zmm7 | zmm10)
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm8
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm8, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm1, %ymm0, %ymm10
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm9, %zmm10, %zmm9
-; AVX512VL-NEXT:    vpandq %zmm6, %zmm9, %zmm9
-; AVX512VL-NEXT:    vpandn %ymm1, %ymm6, %ymm1
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm6 ^ (ymm7 | ymm8)
+; AVX512VL-NEXT:    vpand %ymm1, %ymm13, %ymm1
+; AVX512VL-NEXT:    vpmullw %ymm1, %ymm0, %ymm6
+; AVX512VL-NEXT:    vpand %ymm5, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpandn %ymm1, %ymm5, %ymm1
 ; AVX512VL-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsllw $8, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpandn %ymm8, %ymm6, %ymm1
-; AVX512VL-NEXT:    vpmaddubsw %ymm1, %ymm5, %ymm1
-; AVX512VL-NEXT:    vpsllw $8, %ymm1, %ymm1
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm7 ^ (zmm0 | zmm9)
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm7 ^ (ymm0 | ymm6)
 ; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm1
 ; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm5
-; AVX512VL-NEXT:    vpand %ymm2, %ymm5, %ymm5
-; AVX512VL-NEXT:    vpshufb %ymm5, %ymm4, %ymm5
-; AVX512VL-NEXT:    vpor %ymm5, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm5
-; AVX512VL-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512VL-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm14, %zmm0
 ; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512VL-NEXT:    retq
   %a.ext = zext <64 x i8> %a to <64 x i16>
@@ -1463,296 +1527,312 @@ define <64 x i8> @clmulh_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 define <32 x i16> @clmulh_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
 ; AVX512F-LABEL: clmulh_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
-; AVX512F-NEXT:    vpshufb %ymm3, %ymm2, %ymm5
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX512F-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpshufb %ymm12, %ymm4, %ymm0
 ; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT:    vpand %ymm2, %ymm5, %ymm6
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpshufb %ymm6, %ymm4, %ymm6
-; AVX512F-NEXT:    vpshufb %ymm3, %ymm1, %ymm7
-; AVX512F-NEXT:    vpand %ymm2, %ymm7, %ymm1
-; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm6
-; AVX512F-NEXT:    vpsrlw $4, %ymm5, %ymm1
-; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm5
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
-; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpshufb %ymm5, %ymm1, %ymm5
-; AVX512F-NEXT:    vpsrlw $4, %ymm7, %ymm7
+; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpshufb %ymm3, %ymm8, %ymm3
+; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm3, %ymm3
+; AVX512F-NEXT:    vpshufb %ymm12, %ymm5, %ymm0
+; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm7
 ; AVX512F-NEXT:    vpand %ymm2, %ymm7, %ymm7
-; AVX512F-NEXT:    vpshufb %ymm7, %ymm1, %ymm7
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512F-NEXT:    vporq %zmm5, %zmm6, %zmm5
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm6
-; AVX512F-NEXT:    vpshufb %ymm3, %ymm6, %ymm6
-; AVX512F-NEXT:    vpand %ymm2, %ymm6, %ymm9
-; AVX512F-NEXT:    vpshufb %ymm9, %ymm4, %ymm9
-; AVX512F-NEXT:    vpsrlw $4, %ymm6, %ymm6
-; AVX512F-NEXT:    vpand %ymm2, %ymm6, %ymm6
-; AVX512F-NEXT:    vpshufb %ymm6, %ymm1, %ymm6
-; AVX512F-NEXT:    vpor %ymm6, %ymm9, %ymm6
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm8, %ymm8
-; AVX512F-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm9
-; AVX512F-NEXT:    vpshufb %ymm9, %ymm4, %ymm9
+; AVX512F-NEXT:    vpshufb %ymm7, %ymm6, %ymm7
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm14 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512F-NEXT:    vpand %ymm7, %ymm14, %ymm9
+; AVX512F-NEXT:    vpmullw %ymm3, %ymm9, %ymm9
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm10
+; AVX512F-NEXT:    vpmullw %ymm3, %ymm10, %ymm10
+; AVX512F-NEXT:    vpxor %ymm9, %ymm10, %ymm9
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm10
+; AVX512F-NEXT:    vpmullw %ymm3, %ymm10, %ymm10
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm11
+; AVX512F-NEXT:    vpmullw %ymm3, %ymm11, %ymm11
+; AVX512F-NEXT:    vpxor %ymm11, %ymm10, %ymm10
+; AVX512F-NEXT:    vpxor %ymm10, %ymm9, %ymm9
+; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm10
+; AVX512F-NEXT:    vpmullw %ymm3, %ymm10, %ymm10
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm11
+; AVX512F-NEXT:    vpmullw %ymm3, %ymm11, %ymm11
+; AVX512F-NEXT:    vpxor %ymm11, %ymm10, %ymm10
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm11
+; AVX512F-NEXT:    vpmullw %ymm3, %ymm11, %ymm11
+; AVX512F-NEXT:    vpxor %ymm11, %ymm10, %ymm10
+; AVX512F-NEXT:    vpxor %ymm10, %ymm9, %ymm9
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm10
+; AVX512F-NEXT:    vpmullw %ymm3, %ymm10, %ymm10
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm11
+; AVX512F-NEXT:    vpmullw %ymm3, %ymm11, %ymm11
+; AVX512F-NEXT:    vpxor %ymm11, %ymm10, %ymm10
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm11
+; AVX512F-NEXT:    vpmullw %ymm3, %ymm11, %ymm11
+; AVX512F-NEXT:    vpxor %ymm11, %ymm10, %ymm10
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm11
+; AVX512F-NEXT:    vpmullw %ymm3, %ymm11, %ymm11
+; AVX512F-NEXT:    vpxor %ymm11, %ymm10, %ymm10
+; AVX512F-NEXT:    vpxor %ymm10, %ymm9, %ymm1
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm13 = [2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048]
+; AVX512F-NEXT:    vpand %ymm7, %ymm13, %ymm7
+; AVX512F-NEXT:    vpmullw %ymm7, %ymm3, %ymm7
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm11 = [4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096]
+; AVX512F-NEXT:    vpand %ymm0, %ymm11, %ymm9
+; AVX512F-NEXT:    vpmullw %ymm3, %ymm9, %ymm9
+; AVX512F-NEXT:    vpxor %ymm7, %ymm9, %ymm7
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm10 = [8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192]
+; AVX512F-NEXT:    vpand %ymm0, %ymm10, %ymm9
+; AVX512F-NEXT:    vpmullw %ymm3, %ymm9, %ymm9
+; AVX512F-NEXT:    vpxor %ymm7, %ymm9, %ymm7
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm9 = [16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384]
+; AVX512F-NEXT:    vpand %ymm0, %ymm9, %ymm15
+; AVX512F-NEXT:    vpmullw %ymm3, %ymm15, %ymm15
+; AVX512F-NEXT:    vpxor %ymm7, %ymm15, %ymm15
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm7 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX512F-NEXT:    vpand %ymm7, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT:    vpxor %ymm0, %ymm15, %ymm0
+; AVX512F-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpshufb %ymm12, %ymm0, %ymm0
+; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm1
+; AVX512F-NEXT:    vpshufb %ymm1, %ymm8, %ymm1
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vpor %ymm0, %ymm9, %ymm0
-; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm8, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
-; AVX512F-NEXT:    vpxorq %zmm7, %zmm8, %zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm8, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm9
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ zmm7 ^ zmm8
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm8, %ymm8
-; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm8, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ zmm9 ^ zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm9
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ zmm8 ^ zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm8, %ymm8
-; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm8, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ zmm9 ^ zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm9
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
+; AVX512F-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm1, %ymm3
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm4, %ymm0
+; AVX512F-NEXT:    vpshufb %ymm12, %ymm0, %ymm0
+; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm1
+; AVX512F-NEXT:    vpshufb %ymm1, %ymm8, %ymm1
+; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm5, %ymm1
+; AVX512F-NEXT:    vpshufb %ymm12, %ymm1, %ymm4
+; AVX512F-NEXT:    vpsrlw $4, %ymm4, %ymm1
+; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT:    vpand %ymm1, %ymm14, %ymm5
+; AVX512F-NEXT:    vpmullw %ymm5, %ymm0, %ymm5
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm15
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm15, %ymm15
+; AVX512F-NEXT:    vpxor %ymm5, %ymm15, %ymm5
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm15
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm15, %ymm15
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm14
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm14, %ymm14
+; AVX512F-NEXT:    vpxor %ymm14, %ymm15, %ymm14
+; AVX512F-NEXT:    vpxor %ymm5, %ymm14, %ymm5
+; AVX512F-NEXT:    vpand %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT:    vpshufb %ymm4, %ymm8, %ymm4
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm14
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm14, %ymm14
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm15
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm15, %ymm15
+; AVX512F-NEXT:    vpxor %ymm15, %ymm14, %ymm14
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm15
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm15, %ymm15
+; AVX512F-NEXT:    vpxor %ymm15, %ymm14, %ymm14
+; AVX512F-NEXT:    vpxor %ymm5, %ymm14, %ymm5
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm14
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm14, %ymm14
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm15
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm15, %ymm15
+; AVX512F-NEXT:    vpxor %ymm15, %ymm14, %ymm14
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm15
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm15, %ymm15
+; AVX512F-NEXT:    vpxor %ymm15, %ymm14, %ymm14
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm15
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm15, %ymm15
+; AVX512F-NEXT:    vpxor %ymm15, %ymm14, %ymm14
+; AVX512F-NEXT:    vpxor %ymm5, %ymm14, %ymm5
+; AVX512F-NEXT:    vpand %ymm1, %ymm13, %ymm1
+; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT:    vpand %ymm4, %ymm11, %ymm11
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm11, %ymm11
+; AVX512F-NEXT:    vpxor %ymm1, %ymm11, %ymm1
+; AVX512F-NEXT:    vpand %ymm4, %ymm10, %ymm10
+; AVX512F-NEXT:    vpmullw %ymm0, %ymm10, %ymm10
+; AVX512F-NEXT:    vpxor %ymm1, %ymm10, %ymm1
+; AVX512F-NEXT:    vpand %ymm4, %ymm9, %ymm9
 ; AVX512F-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ zmm8 ^ zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm8, %ymm8
-; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm8, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
-; AVX512F-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ zmm9 ^ zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm5, %ymm9
-; AVX512F-NEXT:    vpmullw %ymm6, %ymm9, %ymm6
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ zmm8 ^ zmm7
-; AVX512F-NEXT:    vpshufb %ymm3, %ymm0, %ymm5
-; AVX512F-NEXT:    vpand %ymm2, %ymm5, %ymm6
-; AVX512F-NEXT:    vpshufb %ymm6, %ymm4, %ymm6
-; AVX512F-NEXT:    vpsrlw $4, %ymm5, %ymm5
-; AVX512F-NEXT:    vpand %ymm2, %ymm5, %ymm5
-; AVX512F-NEXT:    vpshufb %ymm5, %ymm1, %ymm5
-; AVX512F-NEXT:    vpor %ymm5, %ymm6, %ymm5
-; AVX512F-NEXT:    vpsrlw $1, %ymm5, %ymm5
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512F-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT:    vpxor %ymm1, %ymm9, %ymm1
+; AVX512F-NEXT:    vpand %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpxor %ymm0, %ymm5, %ymm0
+; AVX512F-NEXT:    vpshufb %ymm12, %ymm0, %ymm0
+; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm1
+; AVX512F-NEXT:    vpshufb %ymm1, %ymm8, %ymm1
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpsrlw $1, %ymm3, %ymm1
 ; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm5, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: clmulh_v32i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
-; AVX512VL-NEXT:    vpshufb %ymm3, %ymm2, %ymm5
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX512VL-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512VL-NEXT:    vpshufb %ymm3, %ymm0, %ymm5
 ; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VL-NEXT:    vpand %ymm2, %ymm5, %ymm6
 ; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
 ; AVX512VL-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512VL-NEXT:    vpshufb %ymm6, %ymm4, %ymm6
-; AVX512VL-NEXT:    vpshufb %ymm3, %ymm1, %ymm7
-; AVX512VL-NEXT:    vpand %ymm2, %ymm7, %ymm1
-; AVX512VL-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm6
-; AVX512VL-NEXT:    vpsrlw $4, %ymm5, %ymm1
-; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm5
-; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
-; AVX512VL-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512VL-NEXT:    vpshufb %ymm5, %ymm1, %ymm5
-; AVX512VL-NEXT:    vpsrlw $4, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpsrlw $4, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpand %ymm2, %ymm5, %ymm7
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512VL-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512VL-NEXT:    vpshufb %ymm7, %ymm5, %ymm7
+; AVX512VL-NEXT:    vpor %ymm7, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpshufb %ymm3, %ymm1, %ymm8
+; AVX512VL-NEXT:    vpsrlw $4, %ymm8, %ymm7
 ; AVX512VL-NEXT:    vpand %ymm2, %ymm7, %ymm7
-; AVX512VL-NEXT:    vpshufb %ymm7, %ymm1, %ymm7
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512VL-NEXT:    vporq %zmm5, %zmm6, %zmm5
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm6
+; AVX512VL-NEXT:    vpshufb %ymm7, %ymm5, %ymm7
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm22 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512VL-NEXT:    vpandq %ymm22, %ymm7, %ymm9
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm23 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT:    vpandq %ymm23, %ymm7, %ymm10
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpxor %ymm9, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm24 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VL-NEXT:    vpandq %ymm24, %ymm7, %ymm9
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm11
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm25 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpandq %ymm25, %ymm7, %ymm12
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm12, %ymm12
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ ymm10 ^ ymm11
+; AVX512VL-NEXT:    vpand %ymm2, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpshufb %ymm8, %ymm4, %ymm8
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm26 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpandq %ymm26, %ymm8, %ymm11
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm11, %ymm13
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm27 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512VL-NEXT:    vpandq %ymm27, %ymm8, %ymm14
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm14, %ymm14
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 ^ ymm12 ^ ymm13
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512VL-NEXT:    vpand %ymm12, %ymm8, %ymm13
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm13, %ymm15
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512VL-NEXT:    vpand %ymm13, %ymm8, %ymm9
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ ymm14 ^ ymm15
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX512VL-NEXT:    vpand %ymm7, %ymm14, %ymm15
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm15, %ymm10
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [512,512,512,512,512,512,512,512,512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT:    vpand %ymm7, %ymm15, %ymm11
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm11, %ymm11
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 ^ ymm9 ^ ymm10
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm16 = [1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024]
+; AVX512VL-NEXT:    vpandq %ymm16, %ymm7, %ymm9
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm17 = [2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048,2048]
+; AVX512VL-NEXT:    vpandq %ymm17, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpmullw %ymm7, %ymm6, %ymm7
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ ymm11 ^ ymm9
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm18 = [4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096,4096]
+; AVX512VL-NEXT:    vpandq %ymm18, %ymm8, %ymm9
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm19 = [8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192,8192]
+; AVX512VL-NEXT:    vpandq %ymm19, %ymm8, %ymm10
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 ^ ymm7 ^ ymm9
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm20 = [16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384,16384]
+; AVX512VL-NEXT:    vpandq %ymm20, %ymm8, %ymm7
+; AVX512VL-NEXT:    vpmullw %ymm7, %ymm6, %ymm7
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm21 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX512VL-NEXT:    vpandq %ymm21, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm8, %ymm6
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 ^ ymm10 ^ ymm7
 ; AVX512VL-NEXT:    vpshufb %ymm3, %ymm6, %ymm6
-; AVX512VL-NEXT:    vpand %ymm2, %ymm6, %ymm9
-; AVX512VL-NEXT:    vpshufb %ymm9, %ymm4, %ymm9
+; AVX512VL-NEXT:    vpand %ymm2, %ymm6, %ymm7
+; AVX512VL-NEXT:    vpshufb %ymm7, %ymm4, %ymm7
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm6, %ymm6
 ; AVX512VL-NEXT:    vpand %ymm2, %ymm6, %ymm6
-; AVX512VL-NEXT:    vpshufb %ymm6, %ymm1, %ymm6
-; AVX512VL-NEXT:    vpor %ymm6, %ymm9, %ymm6
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpshufb %ymm6, %ymm5, %ymm6
+; AVX512VL-NEXT:    vpor %ymm6, %ymm7, %ymm6
+; AVX512VL-NEXT:    vpsrlw $1, %ymm6, %ymm6
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512VL-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm9
-; AVX512VL-NEXT:    vpshufb %ymm9, %ymm4, %ymm9
+; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm7
+; AVX512VL-NEXT:    vpshufb %ymm7, %ymm4, %ymm7
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT:    vpor %ymm0, %ymm9, %ymm0
-; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm8, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
-; AVX512VL-NEXT:    vpxorq %zmm7, %zmm8, %zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm8, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm7, %ymm0
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm3, %ymm1, %ymm7
+; AVX512VL-NEXT:    vpsrlw $4, %ymm7, %ymm1
+; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
+; AVX512VL-NEXT:    vpandq %ymm22, %ymm1, %ymm8
 ; AVX512VL-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm9
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpandq %ymm23, %ymm1, %ymm9
 ; AVX512VL-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ zmm7 ^ zmm8
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm8, %ymm8
-; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm8, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpxor %ymm8, %ymm9, %ymm8
+; AVX512VL-NEXT:    vpandq %ymm24, %ymm1, %ymm9
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpandq %ymm25, %ymm1, %ymm10
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 ^ ymm8 ^ ymm9
+; AVX512VL-NEXT:    vpand %ymm2, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpshufb %ymm7, %ymm4, %ymm7
+; AVX512VL-NEXT:    vpandq %ymm26, %ymm7, %ymm8
 ; AVX512VL-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ zmm9 ^ zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm9
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpandq %ymm27, %ymm7, %ymm9
 ; AVX512VL-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ zmm8 ^ zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm8, %ymm8
-; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm8, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ ymm10 ^ ymm8
+; AVX512VL-NEXT:    vpand %ymm7, %ymm12, %ymm8
 ; AVX512VL-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ zmm9 ^ zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm9
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpand %ymm7, %ymm13, %ymm10
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 ^ ymm9 ^ ymm8
+; AVX512VL-NEXT:    vpand %ymm1, %ymm14, %ymm8
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpand %ymm1, %ymm15, %ymm9
 ; AVX512VL-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ zmm8 ^ zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm8, %ymm8
-; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm8
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm8, %ymm10
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ ymm10 ^ ymm8
+; AVX512VL-NEXT:    vpandq %ymm16, %ymm1, %ymm8
 ; AVX512VL-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ zmm9 ^ zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm7, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm7
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm5, %ymm9
-; AVX512VL-NEXT:    vpmullw %ymm6, %ymm9, %ymm6
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ zmm8 ^ zmm7
-; AVX512VL-NEXT:    vpshufb %ymm3, %ymm0, %ymm5
-; AVX512VL-NEXT:    vpand %ymm2, %ymm5, %ymm6
-; AVX512VL-NEXT:    vpshufb %ymm6, %ymm4, %ymm6
-; AVX512VL-NEXT:    vpsrlw $4, %ymm5, %ymm5
-; AVX512VL-NEXT:    vpand %ymm2, %ymm5, %ymm5
-; AVX512VL-NEXT:    vpshufb %ymm5, %ymm1, %ymm5
-; AVX512VL-NEXT:    vpor %ymm5, %ymm6, %ymm5
-; AVX512VL-NEXT:    vpsrlw $1, %ymm5, %ymm5
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VL-NEXT:    vpandq %ymm17, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmullw %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ ymm9 ^ ymm8
+; AVX512VL-NEXT:    vpandq %ymm18, %ymm7, %ymm8
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpandq %ymm19, %ymm7, %ymm9
+; AVX512VL-NEXT:    vpmullw %ymm0, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ ymm1 ^ ymm8
+; AVX512VL-NEXT:    vpandq %ymm20, %ymm7, %ymm1
+; AVX512VL-NEXT:    vpmullw %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpandq %ymm21, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpmullw %ymm7, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ ymm9 ^ ymm1
 ; AVX512VL-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm5, %zmm0
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm6, %zmm0
 ; AVX512VL-NEXT:    retq
   %a.ext = zext <32 x i16> %a to <32 x i32>
   %b.ext = zext <32 x i16> %b to <32 x i32>
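
For anyone skimming the regenerated checks rather than reading every line: the
long AND/PMULLW/PXOR chains above expand the carry-less multiply one operand
bit at a time (mask bit i of one input, multiply, XOR the partial product in),
clmulr comes out of a clmul on bitreversed inputs with the result bitreversed,
and clmulh is that shifted right by one, which is the trailing psrlw $1 in the
clmulh checks. Below is a minimal C++ reference model of the scalar semantics
these tests pin down, assuming the ISD::CLMUL family follows the RISC-V Zbc
clmul/clmulh/clmulr definitions; the helper names are illustrative only, not
anything in the patch.

#include <cassert>
#include <cstdint>

// Full 16x16 -> 32-bit carry-less product: XOR together (a << i) for every
// set bit i of b. No carries propagate between bit positions.
static uint32_t clmulFull16(uint16_t a, uint16_t b) {
  uint32_t Acc = 0;
  for (int I = 0; I < 16; ++I)
    if (b & (1u << I))
      Acc ^= (uint32_t)a << I;
  return Acc;
}

// clmul keeps bits [15:0] of the full product, clmulh bits [31:16], and
// clmulr bits [30:15], so clmulh(a, b) == clmulr(a, b) >> 1. That identity
// is what the bitreverse shuffle tables plus the final psrlw $1 compute in
// the checks above.
static uint16_t clmul16(uint16_t a, uint16_t b) { return (uint16_t)clmulFull16(a, b); }
static uint16_t clmulh16(uint16_t a, uint16_t b) { return (uint16_t)(clmulFull16(a, b) >> 16); }
static uint16_t clmulr16(uint16_t a, uint16_t b) { return (uint16_t)(clmulFull16(a, b) >> 15); }

int main() {
  // Squaring in GF(2)[x] spreads bits to even positions: 0x00ff -> 0x5555.
  assert(clmul16(0x00ff, 0x00ff) == 0x5555);
  assert(clmulh16(0xffff, 0xffff) == clmulr16(0xffff, 0xffff) >> 1);
  return 0;
}

The zext context lines just above show the same idea at the IR level: widen
the elements, clmul in the wider type, then shift and truncate to read out
the low, high, or reversed half.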
diff --git a/llvm/test/CodeGen/X86/clmul-vector.ll b/llvm/test/CodeGen/X86/clmul-vector.ll
index dc0d2bc0fa20f..8f26f84c01883 100644
--- a/llvm/test/CodeGen/X86/clmul-vector.ll
+++ b/llvm/test/CodeGen/X86/clmul-vector.ll
@@ -1382,13 +1382,12 @@ define <16 x i8> @clmulr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE2-LABEL: clmulr_v16i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psllw $4, %xmm3
+; SSE2-NEXT:    psrlw $4, %xmm3
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pandn %xmm3, %xmm4
-; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm3
 ; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    psllw $4, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; SSE2-NEXT:    psrlw $2, %xmm4
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
@@ -1405,12 +1404,11 @@ define <16 x i8> @clmulr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE2-NEXT:    por %xmm5, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    movdqa %xmm1, %xmm7
-; SSE2-NEXT:    psllw $4, %xmm7
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    pandn %xmm7, %xmm5
-; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    psrlw $4, %xmm5
+; SSE2-NEXT:    pand %xmm2, %xmm5
 ; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    psllw $4, %xmm1
 ; SSE2-NEXT:    por %xmm1, %xmm5
 ; SSE2-NEXT:    movdqa %xmm5, %xmm1
 ; SSE2-NEXT:    psrlw $2, %xmm1
@@ -1512,11 +1510,11 @@ define <16 x i8> @clmulr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE2-NEXT:    packuswb %xmm8, %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psllw $4, %xmm1
-; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pandn %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    psllw $4, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $2, %xmm1
 ; SSE2-NEXT:    pand %xmm3, %xmm1
@@ -1682,6 +1680,7 @@ define <16 x i8> @clmulr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpsrlw $7, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
@@ -1733,24 +1732,24 @@ define <8 x i16> @clmulr_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    psrlw $4, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [3855,3855,3855,3855,3855,3855,3855,3855]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE2-NEXT:    pand %xmm2, %xmm3
 ; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    psllw $4, %xmm0
 ; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; SSE2-NEXT:    psrlw $2, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    pand %xmm3, %xmm4
 ; SSE2-NEXT:    pand %xmm3, %xmm0
 ; SSE2-NEXT:    psllw $2, %xmm0
 ; SSE2-NEXT:    por %xmm4, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
 ; SSE2-NEXT:    psrlw $1, %xmm5
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [21845,21845,21845,21845,21845,21845,21845,21845]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
 ; SSE2-NEXT:    pand %xmm4, %xmm5
 ; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    paddw %xmm0, %xmm0
+; SSE2-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NEXT:    por %xmm5, %xmm0
 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
 ; SSE2-NEXT:    psrlw $8, %xmm5
@@ -1769,70 +1768,72 @@ define <8 x i16> @clmulr_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE2-NEXT:    psllw $2, %xmm5
 ; SSE2-NEXT:    por %xmm1, %xmm5
 ; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    paddw %xmm1, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2,2,2,2,2,2,2,2]
+; SSE2-NEXT:    pand %xmm4, %xmm5
+; SSE2-NEXT:    paddb %xmm5, %xmm5
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4]
 ; SSE2-NEXT:    pand %xmm1, %xmm6
 ; SSE2-NEXT:    pmullw %xmm0, %xmm6
-; SSE2-NEXT:    psrlw $1, %xmm5
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1]
-; SSE2-NEXT:    pand %xmm5, %xmm7
+; SSE2-NEXT:    pand %xmm1, %xmm7
 ; SSE2-NEXT:    pmullw %xmm0, %xmm7
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [4,4,4,4,4,4,4,4]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2,2,2,2,2,2,2,2]
 ; SSE2-NEXT:    pand %xmm5, %xmm8
 ; SSE2-NEXT:    pmullw %xmm0, %xmm8
 ; SSE2-NEXT:    pxor %xmm7, %xmm8
-; SSE2-NEXT:    pxor %xmm6, %xmm8
-; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8]
-; SSE2-NEXT:    pand %xmm1, %xmm6
-; SSE2-NEXT:    pmullw %xmm0, %xmm6
-; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8]
 ; SSE2-NEXT:    pand %xmm5, %xmm7
 ; SSE2-NEXT:    pmullw %xmm0, %xmm7
 ; SSE2-NEXT:    pxor %xmm6, %xmm7
 ; SSE2-NEXT:    pxor %xmm8, %xmm7
-; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [32,32,32,32,32,32,32,32]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
 ; SSE2-NEXT:    pand %xmm1, %xmm6
 ; SSE2-NEXT:    pmullw %xmm0, %xmm6
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [64,64,64,64,64,64,64,64]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [32,32,32,32,32,32,32,32]
 ; SSE2-NEXT:    pand %xmm5, %xmm8
 ; SSE2-NEXT:    pmullw %xmm0, %xmm8
 ; SSE2-NEXT:    pxor %xmm6, %xmm8
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [64,64,64,64,64,64,64,64]
+; SSE2-NEXT:    pand %xmm1, %xmm9
+; SSE2-NEXT:    pmullw %xmm0, %xmm9
+; SSE2-NEXT:    pxor %xmm8, %xmm9
+; SSE2-NEXT:    pxor %xmm7, %xmm9
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [128,128,128,128,128,128,128,128]
-; SSE2-NEXT:    pand %xmm1, %xmm6
+; SSE2-NEXT:    pand %xmm5, %xmm6
 ; SSE2-NEXT:    pmullw %xmm0, %xmm6
-; SSE2-NEXT:    pxor %xmm8, %xmm6
-; SSE2-NEXT:    pxor %xmm7, %xmm6
+; SSE2-NEXT:    pxor %xmm9, %xmm6
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [256,256,256,256,256,256,256,256]
-; SSE2-NEXT:    pand %xmm5, %xmm7
+; SSE2-NEXT:    pand %xmm1, %xmm7
 ; SSE2-NEXT:    pmullw %xmm0, %xmm7
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [512,512,512,512,512,512,512,512]
-; SSE2-NEXT:    pand %xmm1, %xmm8
+; SSE2-NEXT:    pand %xmm5, %xmm8
 ; SSE2-NEXT:    pmullw %xmm0, %xmm8
 ; SSE2-NEXT:    pxor %xmm7, %xmm8
-; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [1024,1024,1024,1024,1024,1024,1024,1024]
-; SSE2-NEXT:    pand %xmm5, %xmm9
-; SSE2-NEXT:    pmullw %xmm0, %xmm9
-; SSE2-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [2048,2048,2048,2048,2048,2048,2048,2048]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [1024,1024,1024,1024,1024,1024,1024,1024]
 ; SSE2-NEXT:    pand %xmm1, %xmm7
 ; SSE2-NEXT:    pmullw %xmm0, %xmm7
-; SSE2-NEXT:    pxor %xmm9, %xmm7
+; SSE2-NEXT:    pxor %xmm8, %xmm7
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2048,2048,2048,2048,2048,2048,2048,2048]
+; SSE2-NEXT:    pand %xmm5, %xmm8
+; SSE2-NEXT:    pmullw %xmm0, %xmm8
+; SSE2-NEXT:    pxor %xmm7, %xmm8
+; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [4096,4096,4096,4096,4096,4096,4096,4096]
+; SSE2-NEXT:    pand %xmm1, %xmm7
+; SSE2-NEXT:    pmullw %xmm0, %xmm7
+; SSE2-NEXT:    pxor %xmm8, %xmm7
 ; SSE2-NEXT:    pxor %xmm6, %xmm7
 ; SSE2-NEXT:    psllw $8, %xmm6
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [4096,4096,4096,4096,4096,4096,4096,4096]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [8192,8192,8192,8192,8192,8192,8192,8192]
 ; SSE2-NEXT:    pand %xmm5, %xmm8
 ; SSE2-NEXT:    pmullw %xmm0, %xmm8
-; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [8192,8192,8192,8192,8192,8192,8192,8192]
-; SSE2-NEXT:    pand %xmm1, %xmm9
-; SSE2-NEXT:    pmullw %xmm0, %xmm9
-; SSE2-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
-; SSE2-NEXT:    pmullw %xmm0, %xmm5
-; SSE2-NEXT:    pxor %xmm9, %xmm5
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pmullw %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm5, %xmm0
+; SSE2-NEXT:    pmullw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
+; SSE2-NEXT:    pmullw %xmm5, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm7, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    por %xmm6, %xmm0
@@ -1852,15 +1853,15 @@ define <8 x i16> @clmulr_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand %xmm4, %xmm1
 ; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    paddw %xmm0, %xmm0
+; SSE2-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: clmulr_v8i16:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movdqa %xmm0, %xmm2
-; SSE42-NEXT:    movdqa {{.*#+}} xmm6 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; SSE42-NEXT:    pshufb %xmm6, %xmm2
+; SSE42-NEXT:    movdqa {{.*#+}} xmm5 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; SSE42-NEXT:    pshufb %xmm5, %xmm2
 ; SSE42-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE42-NEXT:    movdqa %xmm2, %xmm0
 ; SSE42-NEXT:    pand %xmm4, %xmm0
@@ -1870,88 +1871,86 @@ define <8 x i16> @clmulr_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE42-NEXT:    psrlw $4, %xmm2
 ; SSE42-NEXT:    pand %xmm4, %xmm2
 ; SSE42-NEXT:    movdqa {{.*#+}} xmm0 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
-; SSE42-NEXT:    movdqa %xmm0, %xmm5
-; SSE42-NEXT:    pshufb %xmm2, %xmm5
-; SSE42-NEXT:    por %xmm7, %xmm5
-; SSE42-NEXT:    pshufb %xmm6, %xmm1
-; SSE42-NEXT:    movdqa %xmm1, %xmm6
-; SSE42-NEXT:    psrlw $4, %xmm6
-; SSE42-NEXT:    pand %xmm4, %xmm6
+; SSE42-NEXT:    movdqa %xmm0, %xmm6
+; SSE42-NEXT:    pshufb %xmm2, %xmm6
+; SSE42-NEXT:    por %xmm7, %xmm6
+; SSE42-NEXT:    pshufb %xmm5, %xmm1
+; SSE42-NEXT:    movdqa %xmm1, %xmm7
+; SSE42-NEXT:    psrlw $4, %xmm7
+; SSE42-NEXT:    pand %xmm4, %xmm7
 ; SSE42-NEXT:    movdqa %xmm0, %xmm2
-; SSE42-NEXT:    pshufb %xmm6, %xmm2
-; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm6 = [2,2,2,2,2,2,2,2]
-; SSE42-NEXT:    pand %xmm2, %xmm6
-; SSE42-NEXT:    pmullw %xmm5, %xmm6
-; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1]
-; SSE42-NEXT:    pand %xmm2, %xmm8
-; SSE42-NEXT:    pmullw %xmm5, %xmm8
-; SSE42-NEXT:    pxor %xmm6, %xmm8
-; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4]
-; SSE42-NEXT:    pand %xmm2, %xmm6
-; SSE42-NEXT:    pmullw %xmm5, %xmm6
-; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8]
+; SSE42-NEXT:    pshufb %xmm7, %xmm2
+; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm7 = [2,2,2,2,2,2,2,2]
+; SSE42-NEXT:    pand %xmm2, %xmm7
+; SSE42-NEXT:    pmullw %xmm6, %xmm7
+; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm9 = [1,1,1,1,1,1,1,1]
+; SSE42-NEXT:    pand %xmm2, %xmm9
+; SSE42-NEXT:    pmullw %xmm6, %xmm9
+; SSE42-NEXT:    pxor %xmm7, %xmm9
+; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm7 = [4,4,4,4,4,4,4,4]
 ; SSE42-NEXT:    pand %xmm2, %xmm7
-; SSE42-NEXT:    pmullw %xmm5, %xmm7
-; SSE42-NEXT:    pxor %xmm6, %xmm7
-; SSE42-NEXT:    pxor %xmm8, %xmm7
+; SSE42-NEXT:    pmullw %xmm6, %xmm7
+; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm8 = [8,8,8,8,8,8,8,8]
+; SSE42-NEXT:    pand %xmm2, %xmm8
+; SSE42-NEXT:    pmullw %xmm6, %xmm8
+; SSE42-NEXT:    pxor %xmm7, %xmm8
+; SSE42-NEXT:    pxor %xmm9, %xmm8
 ; SSE42-NEXT:    pand %xmm4, %xmm1
-; SSE42-NEXT:    movdqa %xmm3, %xmm6
-; SSE42-NEXT:    pshufb %xmm1, %xmm6
+; SSE42-NEXT:    movdqa %xmm3, %xmm7
+; SSE42-NEXT:    pshufb %xmm1, %xmm7
 ; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
-; SSE42-NEXT:    pand %xmm6, %xmm1
-; SSE42-NEXT:    pmullw %xmm5, %xmm1
-; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm8 = [32,32,32,32,32,32,32,32]
-; SSE42-NEXT:    pand %xmm6, %xmm8
-; SSE42-NEXT:    pmullw %xmm5, %xmm8
-; SSE42-NEXT:    pxor %xmm1, %xmm8
-; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm9 = [64,64,64,64,64,64,64,64]
-; SSE42-NEXT:    pand %xmm6, %xmm9
-; SSE42-NEXT:    pmullw %xmm5, %xmm9
-; SSE42-NEXT:    pxor %xmm8, %xmm9
+; SSE42-NEXT:    pand %xmm7, %xmm1
+; SSE42-NEXT:    pmullw %xmm6, %xmm1
+; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm9 = [32,32,32,32,32,32,32,32]
+; SSE42-NEXT:    pand %xmm7, %xmm9
+; SSE42-NEXT:    pmullw %xmm6, %xmm9
+; SSE42-NEXT:    pxor %xmm1, %xmm9
+; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64]
+; SSE42-NEXT:    pand %xmm7, %xmm10
+; SSE42-NEXT:    pmullw %xmm6, %xmm10
+; SSE42-NEXT:    pxor %xmm9, %xmm10
 ; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128]
-; SSE42-NEXT:    pand %xmm6, %xmm1
-; SSE42-NEXT:    pmullw %xmm5, %xmm1
-; SSE42-NEXT:    pxor %xmm9, %xmm1
-; SSE42-NEXT:    pxor %xmm7, %xmm1
-; SSE42-NEXT:    movdqa {{.*#+}} xmm7 = [256,256,256,256,256,256,256,256]
-; SSE42-NEXT:    pand %xmm2, %xmm7
-; SSE42-NEXT:    pmullw %xmm5, %xmm7
-; SSE42-NEXT:    movdqa {{.*#+}} xmm8 = [512,512,512,512,512,512,512,512]
+; SSE42-NEXT:    pand %xmm7, %xmm1
+; SSE42-NEXT:    pmullw %xmm6, %xmm1
+; SSE42-NEXT:    pxor %xmm10, %xmm1
+; SSE42-NEXT:    pxor %xmm8, %xmm1
+; SSE42-NEXT:    movdqa {{.*#+}} xmm8 = [256,256,256,256,256,256,256,256]
 ; SSE42-NEXT:    pand %xmm2, %xmm8
-; SSE42-NEXT:    pmullw %xmm5, %xmm8
-; SSE42-NEXT:    pxor %xmm7, %xmm8
-; SSE42-NEXT:    movdqa {{.*#+}} xmm7 = [1024,1024,1024,1024,1024,1024,1024,1024]
-; SSE42-NEXT:    pand %xmm2, %xmm7
-; SSE42-NEXT:    pmullw %xmm5, %xmm7
-; SSE42-NEXT:    pxor %xmm8, %xmm7
+; SSE42-NEXT:    pmullw %xmm6, %xmm8
+; SSE42-NEXT:    movdqa {{.*#+}} xmm9 = [512,512,512,512,512,512,512,512]
+; SSE42-NEXT:    pand %xmm2, %xmm9
+; SSE42-NEXT:    pmullw %xmm6, %xmm9
+; SSE42-NEXT:    pxor %xmm8, %xmm9
+; SSE42-NEXT:    movdqa {{.*#+}} xmm8 = [1024,1024,1024,1024,1024,1024,1024,1024]
+; SSE42-NEXT:    pand %xmm2, %xmm8
+; SSE42-NEXT:    pmullw %xmm6, %xmm8
+; SSE42-NEXT:    pxor %xmm9, %xmm8
 ; SSE42-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE42-NEXT:    pmullw %xmm5, %xmm2
-; SSE42-NEXT:    pxor %xmm7, %xmm2
+; SSE42-NEXT:    pmullw %xmm6, %xmm2
+; SSE42-NEXT:    pxor %xmm8, %xmm2
 ; SSE42-NEXT:    pxor %xmm1, %xmm2
-; SSE42-NEXT:    psllw $8, %xmm1
-; SSE42-NEXT:    movdqa {{.*#+}} xmm7 = [4096,4096,4096,4096,4096,4096,4096,4096]
-; SSE42-NEXT:    pand %xmm6, %xmm7
-; SSE42-NEXT:    pmullw %xmm5, %xmm7
+; SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [4096,4096,4096,4096,4096,4096,4096,4096]
+; SSE42-NEXT:    pand %xmm7, %xmm1
+; SSE42-NEXT:    pmullw %xmm6, %xmm1
 ; SSE42-NEXT:    movdqa {{.*#+}} xmm8 = [8192,8192,8192,8192,8192,8192,8192,8192]
-; SSE42-NEXT:    pand %xmm6, %xmm8
-; SSE42-NEXT:    pmullw %xmm5, %xmm8
-; SSE42-NEXT:    pxor %xmm7, %xmm8
-; SSE42-NEXT:    movdqa {{.*#+}} xmm7 = [16384,16384,16384,16384,16384,16384,16384,16384]
-; SSE42-NEXT:    pand %xmm6, %xmm7
-; SSE42-NEXT:    pmullw %xmm5, %xmm7
-; SSE42-NEXT:    pxor %xmm8, %xmm7
-; SSE42-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE42-NEXT:    pmullw %xmm5, %xmm6
-; SSE42-NEXT:    pxor %xmm7, %xmm6
-; SSE42-NEXT:    pxor %xmm2, %xmm6
-; SSE42-NEXT:    psrlw $8, %xmm6
-; SSE42-NEXT:    por %xmm1, %xmm6
-; SSE42-NEXT:    movdqa %xmm6, %xmm1
+; SSE42-NEXT:    pand %xmm7, %xmm8
+; SSE42-NEXT:    pmullw %xmm6, %xmm8
+; SSE42-NEXT:    pxor %xmm1, %xmm8
+; SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [16384,16384,16384,16384,16384,16384,16384,16384]
+; SSE42-NEXT:    pand %xmm7, %xmm1
+; SSE42-NEXT:    pmullw %xmm6, %xmm1
+; SSE42-NEXT:    pxor %xmm8, %xmm1
+; SSE42-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7
+; SSE42-NEXT:    pmullw %xmm6, %xmm7
+; SSE42-NEXT:    pxor %xmm1, %xmm7
+; SSE42-NEXT:    pxor %xmm2, %xmm7
+; SSE42-NEXT:    pshufb %xmm5, %xmm7
+; SSE42-NEXT:    movdqa %xmm7, %xmm1
 ; SSE42-NEXT:    pand %xmm4, %xmm1
 ; SSE42-NEXT:    pshufb %xmm1, %xmm3
-; SSE42-NEXT:    psrlw $4, %xmm6
-; SSE42-NEXT:    pand %xmm4, %xmm6
-; SSE42-NEXT:    pshufb %xmm6, %xmm0
+; SSE42-NEXT:    psrlw $4, %xmm7
+; SSE42-NEXT:    pand %xmm4, %xmm7
+; SSE42-NEXT:    pshufb %xmm7, %xmm0
 ; SSE42-NEXT:    por %xmm3, %xmm0
 ; SSE42-NEXT:    retq
 ;
@@ -2042,99 +2041,73 @@ define <8 x i16> @clmulr_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-LABEL: clmulr_v4i32:
 ; SSE2-NOPCLMUL:       # %bb.0:
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrld $24, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pslld $8, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [16711935,16711935,16711935,16711935]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    packuswb %xmm3, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psrld $8, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pslld $24, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psrld $4, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [252645135,252645135,252645135,252645135]
+; SSE2-NOPCLMUL-NEXT:    psrlw $4, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm4
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pslld $4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllw $4, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NOPCLMUL-NEXT:    psrld $2, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [858993459,858993459,858993459,858993459]
+; SSE2-NOPCLMUL-NEXT:    psrlw $2, %xmm5
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm5
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pslld $2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllw $2, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psrld $1, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [1431655765,1431655765,1431655765,1431655765]
+; SSE2-NOPCLMUL-NEXT:    psrlw $1, %xmm6
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm6
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
-; SSE2-NOPCLMUL-NEXT:    paddd %xmm0, %xmm0
+; SSE2-NOPCLMUL-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psrld $24, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pslld $8, %xmm7
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psrld $8, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pslld $24, %xmm1
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm1
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm1
+; SSE2-NOPCLMUL-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    packuswb %xmm6, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psrld $4, %xmm6
+; SSE2-NOPCLMUL-NEXT:    psrlw $4, %xmm6
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm6
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pslld $4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllw $4, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psrld $2, %xmm6
+; SSE2-NOPCLMUL-NEXT:    psrlw $2, %xmm6
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm6
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pslld $2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllw $2, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm6
+; SSE2-NOPCLMUL-NEXT:    psrlw $1, %xmm6
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    paddd %xmm6, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [2,2,2,2]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm9
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm1
+; SSE2-NOPCLMUL-NEXT:    paddb %xmm1, %xmm1
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm6
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [4,4,4,4]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,2,2,3]
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm7[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
-; SSE2-NOPCLMUL-NEXT:    psrld $1, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [1,1,1,1]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [4,4,4,4]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm11, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [8,8,8,8]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
+; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [1,1,1,1]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
@@ -2143,18 +2116,28 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [16,16,16,16]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm11
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [2,2,2,2]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm11
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
+; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [8,8,8,8]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm11, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1]
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm10
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
+; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm9, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [32,32,32,32]
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [16,16,16,16]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm10
@@ -2163,7 +2146,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [64,64,64,64]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [32,32,32,32]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm11
@@ -2173,7 +2156,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [128,128,128,128]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [64,64,64,64]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm9
@@ -2184,7 +2167,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [256,256,256,256]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [128,128,128,128]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
@@ -2193,7 +2176,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [512,512,512,512]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [256,256,256,256]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm11
@@ -2203,7 +2186,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [1024,1024,1024,1024]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [512,512,512,512]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
@@ -2213,7 +2196,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [2048,2048,2048,2048]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [1024,1024,1024,1024]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm11, %xmm8
@@ -2224,7 +2207,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm9, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [4096,4096,4096,4096]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [2048,2048,2048,2048]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm10
@@ -2233,7 +2216,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [8192,8192,8192,8192]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [4096,4096,4096,4096]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm11
@@ -2243,7 +2226,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [16384,16384,16384,16384]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [8192,8192,8192,8192]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm10
@@ -2253,7 +2236,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [32768,32768,32768,32768]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [16384,16384,16384,16384]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm11
@@ -2263,7 +2246,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [65536,65536,65536,65536]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [32768,32768,32768,32768]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm9
@@ -2274,7 +2257,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [131072,131072,131072,131072]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [65536,65536,65536,65536]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
@@ -2283,7 +2266,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [262144,262144,262144,262144]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [131072,131072,131072,131072]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm11
@@ -2293,7 +2276,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [524288,524288,524288,524288]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [262144,262144,262144,262144]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
@@ -2303,7 +2286,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [1048576,1048576,1048576,1048576]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [524288,524288,524288,524288]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm11
@@ -2313,7 +2296,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [2097152,2097152,2097152,2097152]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [1048576,1048576,1048576,1048576]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
@@ -2323,7 +2306,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [4194304,4194304,4194304,4194304]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [2097152,2097152,2097152,2097152]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm11, %xmm8
@@ -2334,7 +2317,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm9, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [8388608,8388608,8388608,8388608]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [4194304,4194304,4194304,4194304]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm10
@@ -2343,7 +2326,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [16777216,16777216,16777216,16777216]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [8388608,8388608,8388608,8388608]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm11
@@ -2353,7 +2336,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [33554432,33554432,33554432,33554432]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [16777216,16777216,16777216,16777216]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm10
@@ -2363,7 +2346,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [67108864,67108864,67108864,67108864]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [33554432,33554432,33554432,33554432]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm11
@@ -2373,7 +2356,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [134217728,134217728,134217728,134217728]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [67108864,67108864,67108864,67108864]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm10
@@ -2383,7 +2366,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [268435456,268435456,268435456,268435456]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [134217728,134217728,134217728,134217728]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm11
@@ -2393,7 +2376,7 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [536870912,536870912,536870912,536870912]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [268435456,268435456,268435456,268435456]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm9
@@ -2404,52 +2387,58 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [536870912,536870912,536870912,536870912]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
+; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
 ; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm6
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm8, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm9, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrld $24, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pslld $8, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pslld $24, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psrld $8, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm0
+; SSE2-NOPCLMUL-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    packuswb %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrld $4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $4, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pslld $4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllw $4, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrld $2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $2, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pslld $2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllw $2, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrld $1, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $1, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
-; SSE2-NOPCLMUL-NEXT:    paddd %xmm0, %xmm0
+; SSE2-NOPCLMUL-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    retq
 ;
@@ -2662,578 +2651,635 @@ define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 define <2 x i64> @clmulr_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE2-NOPCLMUL-LABEL: clmulr_v2i64:
 ; SSE2-NOPCLMUL:       # %bb.0:
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm12
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psrlq $24, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [16711680,16711680]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [4278190080,4278190080]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psrlq $40, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [65280,65280]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $56, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psllq $8, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $24, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psllq $40, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psllq $56, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    subq $248, %rsp
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [1085102592571150095,1085102592571150095]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm2
+; SSE2-NOPCLMUL-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    packuswb %xmm2, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlw $4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm8
+; SSE2-NOPCLMUL-NEXT:    psllw $4, %xmm8
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlw $2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm8
+; SSE2-NOPCLMUL-NEXT:    psllw $2, %xmm8
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlw $1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psllq $4, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $2, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [3689348814741910323,3689348814741910323]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psllq $2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [6148914691236517205,6148914691236517205]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm10, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand %xmm10, %xmm0
-; SSE2-NOPCLMUL-NEXT:    paddq %xmm0, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $24, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm9
-; SSE2-NOPCLMUL-NEXT:    psrlq $40, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $56, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm9, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $24, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm12, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm13
-; SSE2-NOPCLMUL-NEXT:    psllq $40, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    psllq $56, %xmm12
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm12
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm12
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm12
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm12, %xmm11
-; SSE2-NOPCLMUL-NEXT:    psrlq $4, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm12
-; SSE2-NOPCLMUL-NEXT:    psllq $4, %xmm12
-; SSE2-NOPCLMUL-NEXT:    por %xmm12, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm8
-; SSE2-NOPCLMUL-NEXT:    psrlq $2, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm11
-; SSE2-NOPCLMUL-NEXT:    psllq $2, %xmm11
-; SSE2-NOPCLMUL-NEXT:    por %xmm11, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm8
+; SSE2-NOPCLMUL-NEXT:    paddb %xmm8, %xmm8
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NOPCLMUL-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    packuswb %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlw $4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm4
+; SSE2-NOPCLMUL-NEXT:    psllw $4, %xmm4
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlw $2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm4
+; SSE2-NOPCLMUL-NEXT:    psllw $2, %xmm4
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $1, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm4
+; SSE2-NOPCLMUL-NEXT:    paddb %xmm4, %xmm4
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm0 = [4,4]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm0, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pand %xmm10, %xmm9
-; SSE2-NOPCLMUL-NEXT:    paddq %xmm9, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psrlq $1, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [1,1]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm7
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm9
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm7
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm7
 ; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm7
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm6
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm7
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm12 = [2,2]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm12, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [8,8]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pand %xmm10, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm3
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [16,16]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm4
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm5
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
+; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm6
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm3
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm15 = [512,512]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm3
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm14 = [4096,4096]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm6
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [8192,8192]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm7
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm10
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm10
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm5
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
+; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm6
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [65536,65536]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [131072,131072]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm3
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [262144,262144]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [524288,524288]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm3
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [1048576,1048576]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [2097152,2097152]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm3
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [4194304,4194304]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [8388608,8388608]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [16777216,16777216]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [33554432,33554432]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, (%rsp) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $56, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm13, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psllq $40, %xmm7
-; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm10
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm10
-; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm15 = [16777216,16777216]
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [67108864,67108864]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [134217728,134217728]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm5
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm4
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm5
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
+; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm0 = [268435456,268435456]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm14 = [33554432,33554432]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm13 = [67108864,67108864]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm13, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [536870912,536870912]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm6
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [134217728,134217728]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [1,1]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm12, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm11
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [32,32]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm10
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [64,64]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm9
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [128,128]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm11
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm12 = [256,256]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm12
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm13 = [1024,1024]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm13
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm0 = [2048,2048]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm14 = [16384,16384]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm0 = [32768,32768]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm7
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [536870912,536870912]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm6
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm9
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm11
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm12
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm13
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm15
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa (%rsp), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, (%rsp) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm8
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm8
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psrlq $39, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [2,2]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $33, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [4,4]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [8,8]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $35, %xmm12
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [16,16]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm12, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [32,32]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [64,64]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [128,128]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psrlq $39, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [512,512]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [1024,1024]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [16711680,16711680]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm10
-; SSE2-NOPCLMUL-NEXT:    psllq $24, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [4278190080,4278190080]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $8, %xmm1
-; SSE2-NOPCLMUL-NEXT:    por %xmm10, %xmm1
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [2048,2048]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [4096,4096]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm12, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [8192,8192]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [16384,16384]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm6, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm5
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pxor {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm10
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm9
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm9
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm11
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm9, %xmm11
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm12
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm12
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm12, %xmm2
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm13
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm13
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psrlq $39, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [131072,131072]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [262144,262144]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm13, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [524288,524288]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [1048576,1048576]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm12, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [2097152,2097152]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [4194304,4194304]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa (%rsp), %xmm3 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm3
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psrlq $39, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm15
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [8388608,8388608]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm15
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm15
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm15
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm14
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm14
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm14
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm15, %xmm14
-; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm13
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm13
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm13
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm14, %xmm13
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm13, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm12
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm12
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm12
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm12
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm12, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm11
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm11
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $40, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $56, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $24, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $8, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NOPCLMUL-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE2-NOPCLMUL-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    packuswb %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrlq $4, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [1085102592571150095,1085102592571150095]
+; SSE2-NOPCLMUL-NEXT:    psrlw $4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psllq $4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllw $4, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrlq $2, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [3689348814741910323,3689348814741910323]
+; SSE2-NOPCLMUL-NEXT:    psrlw $2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psllq $2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllw $2, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrlq $1, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [6148914691236517205,6148914691236517205]
+; SSE2-NOPCLMUL-NEXT:    psrlw $1, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    paddq %xmm0, %xmm0
+; SSE2-NOPCLMUL-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    addq $248, %rsp
 ; SSE2-NOPCLMUL-NEXT:    retq
 ;
 ; SSE-PCLMUL-LABEL: clmulr_v2i64:
@@ -3282,13 +3328,12 @@ define <16 x i8> @clmulh_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE2-LABEL: clmulh_v16i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psllw $4, %xmm3
+; SSE2-NEXT:    psrlw $4, %xmm3
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pandn %xmm3, %xmm4
-; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm3
 ; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    psllw $4, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; SSE2-NEXT:    psrlw $2, %xmm4
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
@@ -3305,12 +3350,11 @@ define <16 x i8> @clmulh_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE2-NEXT:    por %xmm5, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    movdqa %xmm1, %xmm7
-; SSE2-NEXT:    psllw $4, %xmm7
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    pandn %xmm7, %xmm5
-; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    psrlw $4, %xmm5
+; SSE2-NEXT:    pand %xmm2, %xmm5
 ; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    psllw $4, %xmm1
 ; SSE2-NEXT:    por %xmm1, %xmm5
 ; SSE2-NEXT:    movdqa %xmm5, %xmm1
 ; SSE2-NEXT:    psrlw $2, %xmm1
@@ -3412,11 +3456,11 @@ define <16 x i8> @clmulh_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE2-NEXT:    packuswb %xmm8, %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psllw $4, %xmm1
-; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pandn %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    psllw $4, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $2, %xmm1
 ; SSE2-NEXT:    pand %xmm3, %xmm1
@@ -3637,24 +3681,24 @@ define <8 x i16> @clmulh_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    psrlw $4, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [3855,3855,3855,3855,3855,3855,3855,3855]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE2-NEXT:    pand %xmm2, %xmm3
 ; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    psllw $4, %xmm0
 ; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; SSE2-NEXT:    psrlw $2, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    pand %xmm3, %xmm4
 ; SSE2-NEXT:    pand %xmm3, %xmm0
 ; SSE2-NEXT:    psllw $2, %xmm0
 ; SSE2-NEXT:    por %xmm4, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
 ; SSE2-NEXT:    psrlw $1, %xmm5
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [21845,21845,21845,21845,21845,21845,21845,21845]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
 ; SSE2-NEXT:    pand %xmm4, %xmm5
 ; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    paddw %xmm0, %xmm0
+; SSE2-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NEXT:    por %xmm5, %xmm0
 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
 ; SSE2-NEXT:    psrlw $8, %xmm5
@@ -3673,70 +3717,72 @@ define <8 x i16> @clmulh_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE2-NEXT:    psllw $2, %xmm5
 ; SSE2-NEXT:    por %xmm1, %xmm5
 ; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    paddw %xmm1, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2,2,2,2,2,2,2,2]
+; SSE2-NEXT:    pand %xmm4, %xmm5
+; SSE2-NEXT:    paddb %xmm5, %xmm5
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4]
 ; SSE2-NEXT:    pand %xmm1, %xmm6
 ; SSE2-NEXT:    pmullw %xmm0, %xmm6
-; SSE2-NEXT:    psrlw $1, %xmm5
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1]
-; SSE2-NEXT:    pand %xmm5, %xmm7
+; SSE2-NEXT:    pand %xmm1, %xmm7
 ; SSE2-NEXT:    pmullw %xmm0, %xmm7
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [4,4,4,4,4,4,4,4]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2,2,2,2,2,2,2,2]
 ; SSE2-NEXT:    pand %xmm5, %xmm8
 ; SSE2-NEXT:    pmullw %xmm0, %xmm8
 ; SSE2-NEXT:    pxor %xmm7, %xmm8
-; SSE2-NEXT:    pxor %xmm6, %xmm8
-; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8]
-; SSE2-NEXT:    pand %xmm1, %xmm6
-; SSE2-NEXT:    pmullw %xmm0, %xmm6
-; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8]
 ; SSE2-NEXT:    pand %xmm5, %xmm7
 ; SSE2-NEXT:    pmullw %xmm0, %xmm7
 ; SSE2-NEXT:    pxor %xmm6, %xmm7
 ; SSE2-NEXT:    pxor %xmm8, %xmm7
-; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [32,32,32,32,32,32,32,32]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
 ; SSE2-NEXT:    pand %xmm1, %xmm6
 ; SSE2-NEXT:    pmullw %xmm0, %xmm6
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [64,64,64,64,64,64,64,64]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [32,32,32,32,32,32,32,32]
 ; SSE2-NEXT:    pand %xmm5, %xmm8
 ; SSE2-NEXT:    pmullw %xmm0, %xmm8
 ; SSE2-NEXT:    pxor %xmm6, %xmm8
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [64,64,64,64,64,64,64,64]
+; SSE2-NEXT:    pand %xmm1, %xmm9
+; SSE2-NEXT:    pmullw %xmm0, %xmm9
+; SSE2-NEXT:    pxor %xmm8, %xmm9
+; SSE2-NEXT:    pxor %xmm7, %xmm9
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [128,128,128,128,128,128,128,128]
-; SSE2-NEXT:    pand %xmm1, %xmm6
+; SSE2-NEXT:    pand %xmm5, %xmm6
 ; SSE2-NEXT:    pmullw %xmm0, %xmm6
-; SSE2-NEXT:    pxor %xmm8, %xmm6
-; SSE2-NEXT:    pxor %xmm7, %xmm6
+; SSE2-NEXT:    pxor %xmm9, %xmm6
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [256,256,256,256,256,256,256,256]
-; SSE2-NEXT:    pand %xmm5, %xmm7
+; SSE2-NEXT:    pand %xmm1, %xmm7
 ; SSE2-NEXT:    pmullw %xmm0, %xmm7
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [512,512,512,512,512,512,512,512]
-; SSE2-NEXT:    pand %xmm1, %xmm8
+; SSE2-NEXT:    pand %xmm5, %xmm8
 ; SSE2-NEXT:    pmullw %xmm0, %xmm8
 ; SSE2-NEXT:    pxor %xmm7, %xmm8
-; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [1024,1024,1024,1024,1024,1024,1024,1024]
-; SSE2-NEXT:    pand %xmm5, %xmm9
-; SSE2-NEXT:    pmullw %xmm0, %xmm9
-; SSE2-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [2048,2048,2048,2048,2048,2048,2048,2048]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [1024,1024,1024,1024,1024,1024,1024,1024]
 ; SSE2-NEXT:    pand %xmm1, %xmm7
 ; SSE2-NEXT:    pmullw %xmm0, %xmm7
-; SSE2-NEXT:    pxor %xmm9, %xmm7
+; SSE2-NEXT:    pxor %xmm8, %xmm7
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2048,2048,2048,2048,2048,2048,2048,2048]
+; SSE2-NEXT:    pand %xmm5, %xmm8
+; SSE2-NEXT:    pmullw %xmm0, %xmm8
+; SSE2-NEXT:    pxor %xmm7, %xmm8
+; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [4096,4096,4096,4096,4096,4096,4096,4096]
+; SSE2-NEXT:    pand %xmm1, %xmm7
+; SSE2-NEXT:    pmullw %xmm0, %xmm7
+; SSE2-NEXT:    pxor %xmm8, %xmm7
 ; SSE2-NEXT:    pxor %xmm6, %xmm7
 ; SSE2-NEXT:    psllw $8, %xmm6
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [4096,4096,4096,4096,4096,4096,4096,4096]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [8192,8192,8192,8192,8192,8192,8192,8192]
 ; SSE2-NEXT:    pand %xmm5, %xmm8
 ; SSE2-NEXT:    pmullw %xmm0, %xmm8
-; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [8192,8192,8192,8192,8192,8192,8192,8192]
-; SSE2-NEXT:    pand %xmm1, %xmm9
-; SSE2-NEXT:    pmullw %xmm0, %xmm9
-; SSE2-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
-; SSE2-NEXT:    pmullw %xmm0, %xmm5
-; SSE2-NEXT:    pxor %xmm9, %xmm5
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pmullw %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm5, %xmm0
+; SSE2-NEXT:    pmullw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
+; SSE2-NEXT:    pmullw %xmm5, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm7, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    por %xmm6, %xmm0
@@ -3756,7 +3802,7 @@ define <8 x i16> @clmulh_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand %xmm4, %xmm1
 ; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    paddw %xmm0, %xmm0
+; SSE2-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $1, %xmm0
 ; SSE2-NEXT:    retq
@@ -3764,8 +3810,8 @@ define <8 x i16> @clmulh_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE42-LABEL: clmulh_v8i16:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movdqa %xmm0, %xmm2
-; SSE42-NEXT:    movdqa {{.*#+}} xmm6 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; SSE42-NEXT:    pshufb %xmm6, %xmm2
+; SSE42-NEXT:    movdqa {{.*#+}} xmm5 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; SSE42-NEXT:    pshufb %xmm5, %xmm2
 ; SSE42-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE42-NEXT:    movdqa %xmm2, %xmm0
 ; SSE42-NEXT:    pand %xmm4, %xmm0
@@ -3775,88 +3821,86 @@ define <8 x i16> @clmulh_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE42-NEXT:    psrlw $4, %xmm2
 ; SSE42-NEXT:    pand %xmm4, %xmm2
 ; SSE42-NEXT:    movdqa {{.*#+}} xmm0 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
-; SSE42-NEXT:    movdqa %xmm0, %xmm5
-; SSE42-NEXT:    pshufb %xmm2, %xmm5
-; SSE42-NEXT:    por %xmm7, %xmm5
-; SSE42-NEXT:    pshufb %xmm6, %xmm1
-; SSE42-NEXT:    movdqa %xmm1, %xmm6
-; SSE42-NEXT:    psrlw $4, %xmm6
-; SSE42-NEXT:    pand %xmm4, %xmm6
+; SSE42-NEXT:    movdqa %xmm0, %xmm6
+; SSE42-NEXT:    pshufb %xmm2, %xmm6
+; SSE42-NEXT:    por %xmm7, %xmm6
+; SSE42-NEXT:    pshufb %xmm5, %xmm1
+; SSE42-NEXT:    movdqa %xmm1, %xmm7
+; SSE42-NEXT:    psrlw $4, %xmm7
+; SSE42-NEXT:    pand %xmm4, %xmm7
 ; SSE42-NEXT:    movdqa %xmm0, %xmm2
-; SSE42-NEXT:    pshufb %xmm6, %xmm2
-; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm6 = [2,2,2,2,2,2,2,2]
-; SSE42-NEXT:    pand %xmm2, %xmm6
-; SSE42-NEXT:    pmullw %xmm5, %xmm6
-; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1]
-; SSE42-NEXT:    pand %xmm2, %xmm8
-; SSE42-NEXT:    pmullw %xmm5, %xmm8
-; SSE42-NEXT:    pxor %xmm6, %xmm8
-; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4]
-; SSE42-NEXT:    pand %xmm2, %xmm6
-; SSE42-NEXT:    pmullw %xmm5, %xmm6
-; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8]
+; SSE42-NEXT:    pshufb %xmm7, %xmm2
+; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm7 = [2,2,2,2,2,2,2,2]
+; SSE42-NEXT:    pand %xmm2, %xmm7
+; SSE42-NEXT:    pmullw %xmm6, %xmm7
+; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm9 = [1,1,1,1,1,1,1,1]
+; SSE42-NEXT:    pand %xmm2, %xmm9
+; SSE42-NEXT:    pmullw %xmm6, %xmm9
+; SSE42-NEXT:    pxor %xmm7, %xmm9
+; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm7 = [4,4,4,4,4,4,4,4]
 ; SSE42-NEXT:    pand %xmm2, %xmm7
-; SSE42-NEXT:    pmullw %xmm5, %xmm7
-; SSE42-NEXT:    pxor %xmm6, %xmm7
-; SSE42-NEXT:    pxor %xmm8, %xmm7
+; SSE42-NEXT:    pmullw %xmm6, %xmm7
+; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm8 = [8,8,8,8,8,8,8,8]
+; SSE42-NEXT:    pand %xmm2, %xmm8
+; SSE42-NEXT:    pmullw %xmm6, %xmm8
+; SSE42-NEXT:    pxor %xmm7, %xmm8
+; SSE42-NEXT:    pxor %xmm9, %xmm8
 ; SSE42-NEXT:    pand %xmm4, %xmm1
-; SSE42-NEXT:    movdqa %xmm3, %xmm6
-; SSE42-NEXT:    pshufb %xmm1, %xmm6
+; SSE42-NEXT:    movdqa %xmm3, %xmm7
+; SSE42-NEXT:    pshufb %xmm1, %xmm7
 ; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
-; SSE42-NEXT:    pand %xmm6, %xmm1
-; SSE42-NEXT:    pmullw %xmm5, %xmm1
-; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm8 = [32,32,32,32,32,32,32,32]
-; SSE42-NEXT:    pand %xmm6, %xmm8
-; SSE42-NEXT:    pmullw %xmm5, %xmm8
-; SSE42-NEXT:    pxor %xmm1, %xmm8
-; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm9 = [64,64,64,64,64,64,64,64]
-; SSE42-NEXT:    pand %xmm6, %xmm9
-; SSE42-NEXT:    pmullw %xmm5, %xmm9
-; SSE42-NEXT:    pxor %xmm8, %xmm9
+; SSE42-NEXT:    pand %xmm7, %xmm1
+; SSE42-NEXT:    pmullw %xmm6, %xmm1
+; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm9 = [32,32,32,32,32,32,32,32]
+; SSE42-NEXT:    pand %xmm7, %xmm9
+; SSE42-NEXT:    pmullw %xmm6, %xmm9
+; SSE42-NEXT:    pxor %xmm1, %xmm9
+; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64]
+; SSE42-NEXT:    pand %xmm7, %xmm10
+; SSE42-NEXT:    pmullw %xmm6, %xmm10
+; SSE42-NEXT:    pxor %xmm9, %xmm10
 ; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128]
-; SSE42-NEXT:    pand %xmm6, %xmm1
-; SSE42-NEXT:    pmullw %xmm5, %xmm1
-; SSE42-NEXT:    pxor %xmm9, %xmm1
-; SSE42-NEXT:    pxor %xmm7, %xmm1
-; SSE42-NEXT:    movdqa {{.*#+}} xmm7 = [256,256,256,256,256,256,256,256]
-; SSE42-NEXT:    pand %xmm2, %xmm7
-; SSE42-NEXT:    pmullw %xmm5, %xmm7
-; SSE42-NEXT:    movdqa {{.*#+}} xmm8 = [512,512,512,512,512,512,512,512]
+; SSE42-NEXT:    pand %xmm7, %xmm1
+; SSE42-NEXT:    pmullw %xmm6, %xmm1
+; SSE42-NEXT:    pxor %xmm10, %xmm1
+; SSE42-NEXT:    pxor %xmm8, %xmm1
+; SSE42-NEXT:    movdqa {{.*#+}} xmm8 = [256,256,256,256,256,256,256,256]
 ; SSE42-NEXT:    pand %xmm2, %xmm8
-; SSE42-NEXT:    pmullw %xmm5, %xmm8
-; SSE42-NEXT:    pxor %xmm7, %xmm8
-; SSE42-NEXT:    movdqa {{.*#+}} xmm7 = [1024,1024,1024,1024,1024,1024,1024,1024]
-; SSE42-NEXT:    pand %xmm2, %xmm7
-; SSE42-NEXT:    pmullw %xmm5, %xmm7
-; SSE42-NEXT:    pxor %xmm8, %xmm7
+; SSE42-NEXT:    pmullw %xmm6, %xmm8
+; SSE42-NEXT:    movdqa {{.*#+}} xmm9 = [512,512,512,512,512,512,512,512]
+; SSE42-NEXT:    pand %xmm2, %xmm9
+; SSE42-NEXT:    pmullw %xmm6, %xmm9
+; SSE42-NEXT:    pxor %xmm8, %xmm9
+; SSE42-NEXT:    movdqa {{.*#+}} xmm8 = [1024,1024,1024,1024,1024,1024,1024,1024]
+; SSE42-NEXT:    pand %xmm2, %xmm8
+; SSE42-NEXT:    pmullw %xmm6, %xmm8
+; SSE42-NEXT:    pxor %xmm9, %xmm8
 ; SSE42-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE42-NEXT:    pmullw %xmm5, %xmm2
-; SSE42-NEXT:    pxor %xmm7, %xmm2
+; SSE42-NEXT:    pmullw %xmm6, %xmm2
+; SSE42-NEXT:    pxor %xmm8, %xmm2
 ; SSE42-NEXT:    pxor %xmm1, %xmm2
-; SSE42-NEXT:    psllw $8, %xmm1
-; SSE42-NEXT:    movdqa {{.*#+}} xmm7 = [4096,4096,4096,4096,4096,4096,4096,4096]
-; SSE42-NEXT:    pand %xmm6, %xmm7
-; SSE42-NEXT:    pmullw %xmm5, %xmm7
+; SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [4096,4096,4096,4096,4096,4096,4096,4096]
+; SSE42-NEXT:    pand %xmm7, %xmm1
+; SSE42-NEXT:    pmullw %xmm6, %xmm1
 ; SSE42-NEXT:    movdqa {{.*#+}} xmm8 = [8192,8192,8192,8192,8192,8192,8192,8192]
-; SSE42-NEXT:    pand %xmm6, %xmm8
-; SSE42-NEXT:    pmullw %xmm5, %xmm8
-; SSE42-NEXT:    pxor %xmm7, %xmm8
-; SSE42-NEXT:    movdqa {{.*#+}} xmm7 = [16384,16384,16384,16384,16384,16384,16384,16384]
-; SSE42-NEXT:    pand %xmm6, %xmm7
-; SSE42-NEXT:    pmullw %xmm5, %xmm7
-; SSE42-NEXT:    pxor %xmm8, %xmm7
-; SSE42-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE42-NEXT:    pmullw %xmm5, %xmm6
-; SSE42-NEXT:    pxor %xmm7, %xmm6
-; SSE42-NEXT:    pxor %xmm2, %xmm6
-; SSE42-NEXT:    psrlw $8, %xmm6
-; SSE42-NEXT:    por %xmm1, %xmm6
-; SSE42-NEXT:    movdqa %xmm6, %xmm1
+; SSE42-NEXT:    pand %xmm7, %xmm8
+; SSE42-NEXT:    pmullw %xmm6, %xmm8
+; SSE42-NEXT:    pxor %xmm1, %xmm8
+; SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [16384,16384,16384,16384,16384,16384,16384,16384]
+; SSE42-NEXT:    pand %xmm7, %xmm1
+; SSE42-NEXT:    pmullw %xmm6, %xmm1
+; SSE42-NEXT:    pxor %xmm8, %xmm1
+; SSE42-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7
+; SSE42-NEXT:    pmullw %xmm6, %xmm7
+; SSE42-NEXT:    pxor %xmm1, %xmm7
+; SSE42-NEXT:    pxor %xmm2, %xmm7
+; SSE42-NEXT:    pshufb %xmm5, %xmm7
+; SSE42-NEXT:    movdqa %xmm7, %xmm1
 ; SSE42-NEXT:    pand %xmm4, %xmm1
 ; SSE42-NEXT:    pshufb %xmm1, %xmm3
-; SSE42-NEXT:    psrlw $4, %xmm6
-; SSE42-NEXT:    pand %xmm4, %xmm6
-; SSE42-NEXT:    pshufb %xmm6, %xmm0
+; SSE42-NEXT:    psrlw $4, %xmm7
+; SSE42-NEXT:    pand %xmm4, %xmm7
+; SSE42-NEXT:    pshufb %xmm7, %xmm0
 ; SSE42-NEXT:    por %xmm3, %xmm0
 ; SSE42-NEXT:    psrlw $1, %xmm0
 ; SSE42-NEXT:    retq
@@ -3948,99 +3992,73 @@ define <8 x i16> @clmulh_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-LABEL: clmulh_v4i32:
 ; SSE2-NOPCLMUL:       # %bb.0:
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrld $24, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pslld $8, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [16711935,16711935,16711935,16711935]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    packuswb %xmm3, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psrld $8, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pslld $24, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psrld $4, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [252645135,252645135,252645135,252645135]
+; SSE2-NOPCLMUL-NEXT:    psrlw $4, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm4
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pslld $4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllw $4, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NOPCLMUL-NEXT:    psrld $2, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [858993459,858993459,858993459,858993459]
+; SSE2-NOPCLMUL-NEXT:    psrlw $2, %xmm5
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm5
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pslld $2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllw $2, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psrld $1, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [1431655765,1431655765,1431655765,1431655765]
+; SSE2-NOPCLMUL-NEXT:    psrlw $1, %xmm6
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm6
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
-; SSE2-NOPCLMUL-NEXT:    paddd %xmm0, %xmm0
+; SSE2-NOPCLMUL-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psrld $24, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pslld $8, %xmm7
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psrld $8, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pslld $24, %xmm1
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm1
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm1
+; SSE2-NOPCLMUL-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    packuswb %xmm6, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psrld $4, %xmm6
+; SSE2-NOPCLMUL-NEXT:    psrlw $4, %xmm6
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm6
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pslld $4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllw $4, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psrld $2, %xmm6
+; SSE2-NOPCLMUL-NEXT:    psrlw $2, %xmm6
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm6
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pslld $2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllw $2, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm6
+; SSE2-NOPCLMUL-NEXT:    psrlw $1, %xmm6
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    paddd %xmm6, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [2,2,2,2]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm9
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm1
+; SSE2-NOPCLMUL-NEXT:    paddb %xmm1, %xmm1
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm6
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [4,4,4,4]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,2,2,3]
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm7[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
-; SSE2-NOPCLMUL-NEXT:    psrld $1, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [1,1,1,1]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [4,4,4,4]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm11, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [8,8,8,8]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
+; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [1,1,1,1]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
@@ -4049,18 +4067,28 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [16,16,16,16]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm11
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [2,2,2,2]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm11
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
+; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [8,8,8,8]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm11, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1]
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm10
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
+; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm9, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [32,32,32,32]
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [16,16,16,16]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm10
@@ -4069,7 +4097,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [64,64,64,64]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [32,32,32,32]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm11
@@ -4079,7 +4107,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [128,128,128,128]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [64,64,64,64]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm9
@@ -4090,7 +4118,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [256,256,256,256]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [128,128,128,128]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
@@ -4099,7 +4127,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [512,512,512,512]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [256,256,256,256]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm11
@@ -4109,7 +4137,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [1024,1024,1024,1024]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [512,512,512,512]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
@@ -4119,7 +4147,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [2048,2048,2048,2048]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [1024,1024,1024,1024]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm11, %xmm8
@@ -4130,7 +4158,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm9, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [4096,4096,4096,4096]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [2048,2048,2048,2048]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm10
@@ -4139,7 +4167,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [8192,8192,8192,8192]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [4096,4096,4096,4096]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm11
@@ -4149,7 +4177,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [16384,16384,16384,16384]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [8192,8192,8192,8192]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm10
@@ -4159,7 +4187,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [32768,32768,32768,32768]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [16384,16384,16384,16384]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm11
@@ -4169,7 +4197,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [65536,65536,65536,65536]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [32768,32768,32768,32768]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm9
@@ -4180,7 +4208,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [131072,131072,131072,131072]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [65536,65536,65536,65536]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
@@ -4189,7 +4217,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [262144,262144,262144,262144]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [131072,131072,131072,131072]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm11
@@ -4199,7 +4227,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [524288,524288,524288,524288]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [262144,262144,262144,262144]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
@@ -4209,7 +4237,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [1048576,1048576,1048576,1048576]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [524288,524288,524288,524288]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm11
@@ -4219,7 +4247,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [2097152,2097152,2097152,2097152]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [1048576,1048576,1048576,1048576]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
@@ -4229,7 +4257,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [4194304,4194304,4194304,4194304]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [2097152,2097152,2097152,2097152]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm11, %xmm8
@@ -4240,7 +4268,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm9, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [8388608,8388608,8388608,8388608]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [4194304,4194304,4194304,4194304]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm10
@@ -4249,7 +4277,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [16777216,16777216,16777216,16777216]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [8388608,8388608,8388608,8388608]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm11
@@ -4259,7 +4287,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [33554432,33554432,33554432,33554432]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [16777216,16777216,16777216,16777216]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm10
@@ -4269,7 +4297,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [67108864,67108864,67108864,67108864]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [33554432,33554432,33554432,33554432]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm11
@@ -4279,7 +4307,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [134217728,134217728,134217728,134217728]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [67108864,67108864,67108864,67108864]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm10
@@ -4289,7 +4317,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [268435456,268435456,268435456,268435456]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [134217728,134217728,134217728,134217728]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm11
@@ -4299,7 +4327,7 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [536870912,536870912,536870912,536870912]
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [268435456,268435456,268435456,268435456]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm9
@@ -4310,52 +4338,58 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm9
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [536870912,536870912,536870912,536870912]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm8
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
+; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
 ; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm6
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSE2-NOPCLMUL-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm8, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm9, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrld $24, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pslld $8, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pslld $24, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psrld $8, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm0
+; SSE2-NOPCLMUL-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    packuswb %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrld $4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $4, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pslld $4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllw $4, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrld $2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $2, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pslld $2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllw $2, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrld $1, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $1, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
-; SSE2-NOPCLMUL-NEXT:    paddd %xmm0, %xmm0
+; SSE2-NOPCLMUL-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    psrld $1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    retq
@@ -4503,579 +4537,636 @@ define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE2-NOPCLMUL-LABEL: clmulh_v2i64:
 ; SSE2-NOPCLMUL:       # %bb.0:
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm12
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psrlq $24, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [16711680,16711680]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [4278190080,4278190080]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psrlq $40, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [65280,65280]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $56, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psllq $8, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $24, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psllq $40, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psllq $56, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    subq $248, %rsp
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [1085102592571150095,1085102592571150095]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm2
+; SSE2-NOPCLMUL-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    packuswb %xmm2, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlw $4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm8
+; SSE2-NOPCLMUL-NEXT:    psllw $4, %xmm8
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlw $2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm8
+; SSE2-NOPCLMUL-NEXT:    psllw $2, %xmm8
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlw $1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psllq $4, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $2, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [3689348814741910323,3689348814741910323]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psllq $2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [6148914691236517205,6148914691236517205]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm10, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand %xmm10, %xmm0
-; SSE2-NOPCLMUL-NEXT:    paddq %xmm0, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $24, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm9
-; SSE2-NOPCLMUL-NEXT:    psrlq $40, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $56, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm9, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm8
+; SSE2-NOPCLMUL-NEXT:    paddb %xmm8, %xmm8
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NOPCLMUL-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    packuswb %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlw $4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm4
+; SSE2-NOPCLMUL-NEXT:    psllw $4, %xmm4
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlw $2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm4
+; SSE2-NOPCLMUL-NEXT:    psllw $2, %xmm4
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $1, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm4
+; SSE2-NOPCLMUL-NEXT:    paddb %xmm4, %xmm4
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm0 = [4,4]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $24, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm12, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm13
-; SSE2-NOPCLMUL-NEXT:    psllq $40, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    psllq $56, %xmm12
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm12
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm12
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm12
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm12, %xmm11
-; SSE2-NOPCLMUL-NEXT:    psrlq $4, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm12
-; SSE2-NOPCLMUL-NEXT:    psllq $4, %xmm12
-; SSE2-NOPCLMUL-NEXT:    por %xmm12, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm8
-; SSE2-NOPCLMUL-NEXT:    psrlq $2, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm11
-; SSE2-NOPCLMUL-NEXT:    psllq $2, %xmm11
-; SSE2-NOPCLMUL-NEXT:    por %xmm11, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pand %xmm0, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pand %xmm10, %xmm9
-; SSE2-NOPCLMUL-NEXT:    paddq %xmm9, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psrlq $1, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [1,1]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm7
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm9
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm7
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm7
 ; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm7
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm6
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm7
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm12 = [2,2]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm12, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [8,8]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pand %xmm10, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm3
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [16,16]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm4
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm5
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
+; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm4
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm3
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm15 = [512,512]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm3
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm14 = [4096,4096]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm6
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [8192,8192]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm7
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm5
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
+; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm10
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm10
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm7
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [65536,65536]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [131072,131072]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm3
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [262144,262144]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [524288,524288]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm3
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [1048576,1048576]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [2097152,2097152]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm3
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [4194304,4194304]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [8388608,8388608]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [16777216,16777216]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [33554432,33554432]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, (%rsp) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm6
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [67108864,67108864]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [134217728,134217728]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm5
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm10
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm5
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
+; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm5
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm0 = [268435456,268435456]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [536870912,536870912]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $56, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm13, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psllq $40, %xmm7
-; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm10
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm10
-; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm15 = [16777216,16777216]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm14 = [33554432,33554432]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm13 = [67108864,67108864]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm13, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm6
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [134217728,134217728]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [1,1]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm12, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm11
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [32,32]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm10
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [64,64]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm9
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [128,128]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm11
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm12 = [256,256]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm12
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm13 = [1024,1024]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm13
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm0 = [2048,2048]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm14 = [16384,16384]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm0 = [32768,32768]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm7
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [536870912,536870912]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm6
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm9
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm11
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm12
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm13
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm15
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa (%rsp), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, (%rsp) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm8
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm8
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psrlq $39, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [2,2]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $33, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [4,4]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [8,8]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $35, %xmm12
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [16,16]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm12, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [32,32]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [64,64]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [128,128]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psrlq $39, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [512,512]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [1024,1024]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [16711680,16711680]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm10
-; SSE2-NOPCLMUL-NEXT:    psllq $24, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [4278190080,4278190080]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $8, %xmm1
-; SSE2-NOPCLMUL-NEXT:    por %xmm10, %xmm1
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [2048,2048]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [4096,4096]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm12, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [8192,8192]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [16384,16384]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm6, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm5
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pxor {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm10
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm9
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm9
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm11
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm9, %xmm11
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm12
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm12
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm12, %xmm2
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm13
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm13
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psrlq $39, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [131072,131072]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [262144,262144]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm13, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [524288,524288]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [1048576,1048576]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm12, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [2097152,2097152]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm7
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [4194304,4194304]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa (%rsp), %xmm3 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm3
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psrlq $39, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm15
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [8388608,8388608]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm15
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm15
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm15
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm14
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm14
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm14
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm15, %xmm14
-; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm13
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm13
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm13
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm14, %xmm13
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm13, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm12
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm12
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm12
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm12
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm12, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm11
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm11
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $40, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $56, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $24, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $8, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm0
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NOPCLMUL-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE2-NOPCLMUL-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    packuswb %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrlq $4, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [1085102592571150095,1085102592571150095]
+; SSE2-NOPCLMUL-NEXT:    psrlw $4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psllq $4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllw $4, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrlq $2, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [3689348814741910323,3689348814741910323]
+; SSE2-NOPCLMUL-NEXT:    psrlw $2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psllq $2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllw $2, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrlq $1, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [6148914691236517205,6148914691236517205]
+; SSE2-NOPCLMUL-NEXT:    psrlw $1, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    paddq %xmm0, %xmm0
+; SSE2-NOPCLMUL-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    psrlq $1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    addq $248, %rsp
 ; SSE2-NOPCLMUL-NEXT:    retq
 ;
 ; SSE-PCLMUL-LABEL: clmulh_v2i64:
@@ -5710,632 +5801,569 @@ define void @commutative_clmul_v2i64(<2 x i64> %x, <2 x i64> %y, ptr %p0, ptr %p
 define void @commutative_clmulh_v2i64(<2 x i64> %x, <2 x i64> %y, ptr %p0, ptr %p1) nounwind {
 ; SSE2-NOPCLMUL-LABEL: commutative_clmulh_v2i64:
 ; SSE2-NOPCLMUL:       # %bb.0:
-; SSE2-NOPCLMUL-NEXT:    subq $360, %rsp # imm = 0x168
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $24, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [16711680,16711680]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm3
+; SSE2-NOPCLMUL-NEXT:    subq $248, %rsp
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [4278190080,4278190080]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psrlq $40, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [65280,65280]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $56, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $8, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $24, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $40, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $56, %xmm10
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm10
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm10
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [1085102592571150095,1085102592571150095]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm10
-; SSE2-NOPCLMUL-NEXT:    psllq $4, %xmm10
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $2, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [3689348814741910323,3689348814741910323]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm10
-; SSE2-NOPCLMUL-NEXT:    psllq $2, %xmm10
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [6148914691236517205,6148914691236517205]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm10
-; SSE2-NOPCLMUL-NEXT:    paddq %xmm10, %xmm10
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
-; SSE2-NOPCLMUL-NEXT:    psrlq $24, %xmm0
+; SSE2-NOPCLMUL-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    packuswb %xmm2, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm8
+; SSE2-NOPCLMUL-NEXT:    psllw $4, %xmm8
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm8
+; SSE2-NOPCLMUL-NEXT:    psllw $2, %xmm8
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $1, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm8
+; SSE2-NOPCLMUL-NEXT:    paddb %xmm8, %xmm8
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NOPCLMUL-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    packuswb %xmm1, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlw $4, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psrlq $40, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $56, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm3
+; SSE2-NOPCLMUL-NEXT:    psllw $4, %xmm3
 ; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psllq $24, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlw $2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    psllw $2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NOPCLMUL-NEXT:    psrlw $1, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm3
+; SSE2-NOPCLMUL-NEXT:    paddb %xmm3, %xmm3
+; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm5
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm0 = [4,4]
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm9
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm7
+; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm9
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm7
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm7
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm12 = [2,2]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm12, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $40, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    psllq $56, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm11
-; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm11
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm15
-; SSE2-NOPCLMUL-NEXT:    psrlq $4, %xmm15
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm15
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm11
-; SSE2-NOPCLMUL-NEXT:    psllq $4, %xmm11
-; SSE2-NOPCLMUL-NEXT:    por %xmm11, %xmm15
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, %xmm11
-; SSE2-NOPCLMUL-NEXT:    psrlq $2, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm15
-; SSE2-NOPCLMUL-NEXT:    psllq $2, %xmm15
-; SSE2-NOPCLMUL-NEXT:    por %xmm15, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm14
-; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm14
-; SSE2-NOPCLMUL-NEXT:    paddq %xmm14, %xmm14
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm9
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $1, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [1,1]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm5
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
-; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm13 = [4096,4096]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand %xmm13, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [16384,16384]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm5
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [8,8]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm10, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [16,16]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm4
 ; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [131072,131072]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm5
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [262144,262144]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm6
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm4
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm5
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm12 = [1048576,1048576]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand %xmm12, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm6
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm15 = [512,512]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm14 = [4096,4096]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [8192,8192]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm4
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [2097152,2097152]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm5
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [4194304,4194304]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $56, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $40, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [8388608,8388608]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [65536,65536]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [131072,131072]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm4
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [16777216,16777216]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [33554432,33554432]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm5
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [67108864,67108864]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm8, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [134217728,134217728]
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [262144,262144]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [524288,524288]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [1048576,1048576]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [2097152,2097152]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [4194304,4194304]
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm5
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm8
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [268435456,268435456]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [8388608,8388608]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm4
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [536870912,536870912]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm8
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm8
-; SSE2-NOPCLMUL-NEXT:    por %xmm9, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm11, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm11
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm11
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm8, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psrlq $39, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [16777216,16777216]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [33554432,33554432]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, (%rsp) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [67108864,67108864]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [134217728,134217728]
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psrlq $39, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm4
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm0 = [268435456,268435456]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psrlq $39, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [536870912,536870912]
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    psrlq $35, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [16,16]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm6
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm6
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pand %xmm0, %xmm13
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pand %xmm0, %xmm12
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm5
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [1,1]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm12, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    psrlq $33, %xmm15
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm12 = [4,4]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm12
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [64,64]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm7
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm11
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [32,32]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm10
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [64,64]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm9
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [128,128]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm11
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm12 = [256,256]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm12
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm15
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm13 = [1024,1024]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm13
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm13
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm0 = [2048,2048]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, (%rsp) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
-; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm14
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [2,2]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [8,8]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [32,32]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [128,128]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm15 = [512,512]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm15
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [2048,2048]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm0 = [8192,8192]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [32768,32768]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [524288,524288]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm14
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psrlq $39, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm12
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm15
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm13
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm14 = [16384,16384]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm0 = [32768,32768]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm12
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm13
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm7
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm15
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NOPCLMUL-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm14
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm8
-; SSE2-NOPCLMUL-NEXT:    por %xmm10, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pxor {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm6
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm9
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm11
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm12
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm13
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm15
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa (%rsp), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, (%rsp) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    por %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm6, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pxor {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm10
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm9
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm9
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm11
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm9, %xmm11
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm12
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm12
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm12, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm13
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm13
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm9
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm13, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm9, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [16711680,16711680]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $24, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [4278190080,4278190080]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm8
-; SSE2-NOPCLMUL-NEXT:    psllq $8, %xmm8
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm8
-; SSE2-NOPCLMUL-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm12, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm13, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm15
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm15
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm15, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
@@ -6346,51 +6374,65 @@ define void @commutative_clmulh_v2i64(<2 x i64> %x, <2 x i64> %y, ptr %p0, ptr %
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    movdqa (%rsp), %xmm2 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm14
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm14
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm14
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psrlq $40, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrlq $56, %xmm1
-; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psrlq $24, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pand %xmm10, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psrlq $8, %xmm14
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm14
-; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm14
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm14
-; SSE2-NOPCLMUL-NEXT:    por %xmm8, %xmm14
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psrlq $4, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [1085102592571150095,1085102592571150095]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm14
-; SSE2-NOPCLMUL-NEXT:    psllq $4, %xmm14
-; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm14
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psrlq $2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm14
-; SSE2-NOPCLMUL-NEXT:    psllq $2, %xmm14
-; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm14
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm5
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NOPCLMUL-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE2-NOPCLMUL-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllw $4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllw $2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $1, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    paddb %xmm0, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    psrlq $1, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [6148914691236517205,6148914691236517205]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm14
-; SSE2-NOPCLMUL-NEXT:    paddq %xmm14, %xmm14
-; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm14
-; SSE2-NOPCLMUL-NEXT:    psrlq $1, %xmm14
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, (%rdi)
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, (%rsi)
-; SSE2-NOPCLMUL-NEXT:    addq $360, %rsp # imm = 0x168
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, (%rsi)
+; SSE2-NOPCLMUL-NEXT:    addq $248, %rsp
 ; SSE2-NOPCLMUL-NEXT:    retq
 ;
 ; SSE-PCLMUL-LABEL: commutative_clmulh_v2i64:
@@ -6436,632 +6478,569 @@ define void @commutative_clmulh_v2i64(<2 x i64> %x, <2 x i64> %y, ptr %p0, ptr %
 define void @commutative_clmulr_v2i64(<2 x i64> %x, <2 x i64> %y, ptr %p0, ptr %p1) nounwind {
 ; SSE2-NOPCLMUL-LABEL: commutative_clmulr_v2i64:
 ; SSE2-NOPCLMUL:       # %bb.0:
-; SSE2-NOPCLMUL-NEXT:    subq $360, %rsp # imm = 0x168
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $24, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [16711680,16711680]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm3
+; SSE2-NOPCLMUL-NEXT:    subq $248, %rsp
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [4278190080,4278190080]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psrlq $40, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm8 = [65280,65280]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $56, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $8, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $24, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $40, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $56, %xmm10
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm10
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm10
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [1085102592571150095,1085102592571150095]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm10
-; SSE2-NOPCLMUL-NEXT:    psllq $4, %xmm10
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $2, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [3689348814741910323,3689348814741910323]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm10
-; SSE2-NOPCLMUL-NEXT:    psllq $2, %xmm10
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [6148914691236517205,6148914691236517205]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm10
-; SSE2-NOPCLMUL-NEXT:    paddq %xmm10, %xmm10
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm10
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm11
-; SSE2-NOPCLMUL-NEXT:    psrlq $24, %xmm0
+; SSE2-NOPCLMUL-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    packuswb %xmm2, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm8
+; SSE2-NOPCLMUL-NEXT:    psllw $4, %xmm8
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm8
+; SSE2-NOPCLMUL-NEXT:    psllw $2, %xmm8
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $1, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm8
+; SSE2-NOPCLMUL-NEXT:    paddb %xmm8, %xmm8
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm8
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NOPCLMUL-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    packuswb %xmm1, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlw $4, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psrlq $8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psrlq $40, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $56, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm3
+; SSE2-NOPCLMUL-NEXT:    psllw $4, %xmm3
 ; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $8, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psllq $24, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlw $2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    psllw $2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NOPCLMUL-NEXT:    psrlw $1, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm3
+; SSE2-NOPCLMUL-NEXT:    paddb %xmm3, %xmm3
+; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm5
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm0 = [4,4]
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm9
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm7
+; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm9
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm7, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm7
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm7
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm12 = [2,2]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm12, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $40, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    psllq $56, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm11
-; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm11
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm15
-; SSE2-NOPCLMUL-NEXT:    psrlq $4, %xmm15
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm15
-; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm11
-; SSE2-NOPCLMUL-NEXT:    psllq $4, %xmm11
-; SSE2-NOPCLMUL-NEXT:    por %xmm11, %xmm15
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, %xmm11
-; SSE2-NOPCLMUL-NEXT:    psrlq $2, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm15
-; SSE2-NOPCLMUL-NEXT:    psllq $2, %xmm15
-; SSE2-NOPCLMUL-NEXT:    por %xmm15, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm14
-; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm14
-; SSE2-NOPCLMUL-NEXT:    paddq %xmm14, %xmm14
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, %xmm9
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psrlq $1, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [1,1]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [8,8]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm10, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm4
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [16,16]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm11, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm4
 ; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm4
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm2
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm5
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
-; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm13 = [4096,4096]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand %xmm13, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [16384,16384]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm5
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm4
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm15 = [512,512]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm14 = [4096,4096]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [8192,8192]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm4
 ; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [131072,131072]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm5
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [262144,262144]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm5
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm12 = [1048576,1048576]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pand %xmm12, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm6
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm4
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [2097152,2097152]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm5
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [4194304,4194304]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm6, %xmm7
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    por %xmm7, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    psllq $56, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pand %xmm8, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $40, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [8388608,8388608]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm3
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [65536,65536]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [131072,131072]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm4
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [16777216,16777216]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm5, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [33554432,33554432]
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [262144,262144]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [524288,524288]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [1048576,1048576]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [2097152,2097152]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm2
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [4194304,4194304]
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm5
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
-; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm5
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [67108864,67108864]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm3, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm3
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
-; SSE2-NOPCLMUL-NEXT:    por %xmm8, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [8388608,8388608]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [16777216,16777216]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [33554432,33554432]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, (%rsp) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [67108864,67108864]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [134217728,134217728]
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, %xmm5
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm8
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm3, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [268435456,268435456]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm3
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm4
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm0 = [268435456,268435456]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    por %xmm3, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [536870912,536870912]
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pand %xmm3, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm8
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm8
-; SSE2-NOPCLMUL-NEXT:    por %xmm9, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm11, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm5, %xmm11
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm11
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm8, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psrlq $39, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psrlq $39, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psrlq $39, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pand %xmm7, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [536870912,536870912]
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    psrlq $35, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [16,16]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm8, %xmm6
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm1, %xmm6
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm9, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    por %xmm6, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pand %xmm0, %xmm13
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pand %xmm0, %xmm12
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm5
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [1,1]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm12, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm10, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm11
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [32,32]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm10
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [64,64]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm9
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [128,128]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm11
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm12 = [256,256]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm12
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm13 = [1024,1024]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm13
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm0 = [2048,2048]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm14 = [16384,16384]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm0 = [32768,32768]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    psrlq $33, %xmm15
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm12 = [4,4]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm12
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm7 = [64,64]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm13 = [1024,1024]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm13
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pand %xmm15, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, (%rsp) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
-; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm14
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [2,2]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm4 = [8,8]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [32,32]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [128,128]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm15 = [512,512]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm15
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm11 = [2048,2048]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm11
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm0 = [8192,8192]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm3 = [32768,32768]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm5 = [524288,524288]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pand %xmm14, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm14
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psrlq $39, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm12
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm7
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm9
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm15
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm13
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm4
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm4
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm11
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm3
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm12
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm5
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm13
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm7
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm15
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
-; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm10, %xmm14
+; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648]
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psrlq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pand %xmm6, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm6
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm4
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm7
 ; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm10
-; SSE2-NOPCLMUL-NEXT:    pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm8
-; SSE2-NOPCLMUL-NEXT:    por %xmm10, %xmm8
-; SSE2-NOPCLMUL-NEXT:    pxor {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm9
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm11
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm12
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm13
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm15
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm15
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa (%rsp), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, (%rsp) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm14
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm8, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pmuludq %xmm0, %xmm8
+; SSE2-NOPCLMUL-NEXT:    pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    por %xmm8, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm6, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm4
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm7
+; SSE2-NOPCLMUL-NEXT:    pxor {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm10
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm9
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm10, %xmm9
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm11
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm9, %xmm11
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm12
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm11, %xmm12
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm12, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm13
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm13
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm9
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm9
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm13, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm9, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm15, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm8
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm10 = [16711680,16711680]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm10, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $24, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm9 = [4278190080,4278190080]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm8
-; SSE2-NOPCLMUL-NEXT:    psllq $8, %xmm8
-; SSE2-NOPCLMUL-NEXT:    por %xmm2, %xmm8
-; SSE2-NOPCLMUL-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm4
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm6
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm4, %xmm6
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm6, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm11, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm12, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm13, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm7
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm7, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm15
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm15
-; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm15, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
@@ -7072,50 +7051,64 @@ define void @commutative_clmulr_v2i64(<2 x i64> %x, <2 x i64> %y, ptr %p0, ptr %
 ; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm0
 ; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
 ; SSE2-NOPCLMUL-NEXT:    movdqa (%rsp), %xmm2 # 16-byte Reload
 ; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm14
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm14
-; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm14
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psrlq $40, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm1
-; SSE2-NOPCLMUL-NEXT:    psrlq $56, %xmm1
-; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm1
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psrlq $24, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pand %xmm10, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psrlq $8, %xmm14
-; SSE2-NOPCLMUL-NEXT:    pand %xmm9, %xmm14
-; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm14
-; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm14
-; SSE2-NOPCLMUL-NEXT:    por %xmm8, %xmm14
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psrlq $4, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [1085102592571150095,1085102592571150095]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm14
-; SSE2-NOPCLMUL-NEXT:    psllq $4, %xmm14
-; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm14
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psrlq $2, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm14
-; SSE2-NOPCLMUL-NEXT:    psllq $2, %xmm14
-; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm14
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, %xmm0
-; SSE2-NOPCLMUL-NEXT:    psrlq $1, %xmm0
-; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm1 = [6148914691236517205,6148914691236517205]
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm0
-; SSE2-NOPCLMUL-NEXT:    pand %xmm1, %xmm14
-; SSE2-NOPCLMUL-NEXT:    paddq %xmm14, %xmm14
-; SSE2-NOPCLMUL-NEXT:    por %xmm0, %xmm14
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, (%rdi)
-; SSE2-NOPCLMUL-NEXT:    movdqa %xmm14, (%rsi)
-; SSE2-NOPCLMUL-NEXT:    addq $360, %rsp # imm = 0x168
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm2
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm5
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm2, %xmm5
+; SSE2-NOPCLMUL-NEXT:    psllq $32, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm5, %xmm3
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NOPCLMUL-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NOPCLMUL-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE2-NOPCLMUL-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE2-NOPCLMUL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NOPCLMUL-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NOPCLMUL-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $4, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllw $4, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    psllw $2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NOPCLMUL-NEXT:    psrlw $1, %xmm1
+; SSE2-NOPCLMUL-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm1
+; SSE2-NOPCLMUL-NEXT:    pand %xmm2, %xmm0
+; SSE2-NOPCLMUL-NEXT:    paddb %xmm0, %xmm0
+; SSE2-NOPCLMUL-NEXT:    por %xmm1, %xmm0
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NOPCLMUL-NEXT:    movdqa %xmm0, (%rsi)
+; SSE2-NOPCLMUL-NEXT:    addq $248, %rsp
 ; SSE2-NOPCLMUL-NEXT:    retq
 ;
 ; SSE-PCLMUL-LABEL: commutative_clmulr_v2i64:

>From 8ec437b038bea2c2188af4bba8c76dd8001fa42d Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Mon, 2 Mar 2026 15:40:01 -0800
Subject: [PATCH 2/3] [RISCV] Promote i8/i16/i32 scalable vector CLMUL to i64
 CLMUL with Zvbc.

This handles the simple case where we can widen to an i64 vector
without splitting. More work will be done in follow-ups.

Stacked on #184257
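
The widening is lossless: in a carry-less multiply, bit k of the product
depends only on operand bits at positions <= k, so zero-extending the
operands and truncating the wide product recovers the narrow result
exactly. A quick standalone C++ check of that claim for the i8 case
(illustration only, not part of this patch; clmul64 is a hypothetical
reference helper, not an LLVM API):

  #include <cassert>
  #include <cstdint>

  // Reference carry-less multiply on 64 bits: xor of shifted copies.
  static uint64_t clmul64(uint64_t A, uint64_t B) {
    uint64_t R = 0;
    for (unsigned I = 0; I < 64; ++I)
      if ((B >> I) & 1)
        R ^= A << I;
    return R;
  }

  int main() {
    for (unsigned A = 0; A < 256; ++A)
      for (unsigned B = 0; B < 256; ++B) {
        // i8 clmul: xor of shifted copies, truncated to 8 bits.
        uint8_t Narrow = 0;
        for (unsigned I = 0; I < 8; ++I)
          if ((B >> I) & 1)
            Narrow ^= (uint8_t)(A << I);
        // Promoted form: clmul on zero-extended operands, then truncate.
        assert((uint8_t)clmul64(A, B) == Narrow);
      }
    return 0;
  }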
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp  |   26 +-
 llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll  | 7162 +++++++++++-------
 llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll | 3140 +++++---
 3 files changed, 6789 insertions(+), 3539 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index a8542be937a87..2bdffad2ded26 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1116,8 +1116,18 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
         }
       }
 
-      if (Subtarget.hasStdExtZvbc() && VT.getVectorElementType() == MVT::i64)
-        setOperationAction({ISD::CLMUL, ISD::CLMULH}, VT, Legal);
+      if (Subtarget.hasStdExtZvbc() && Subtarget.hasVInstructionsI64()) {
+        if (VT.getVectorElementType() == MVT::i64)
+          setOperationAction({ISD::CLMUL, ISD::CLMULH}, VT, Legal);
+        else {
+          // Promote to i64 if the LMUL is small enough.
+          // FIXME: Split if necessary to widen.
+          // FIXME: Promote clmulh directly without legalizing to clmul first.
+          MVT I64VecVT = MVT::getVectorVT(MVT::i64, VT.getVectorElementCount());
+          if (isTypeLegal(I64VecVT))
+            setOperationAction(ISD::CLMUL, VT, Custom);
+        }
+      }
 
       setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
     }
@@ -8920,6 +8930,18 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
       return lowerToScalableOp(Op, DAG);
     assert(Op.getOpcode() != ISD::CTTZ);
     return lowerCTLZ_CTTZ_ZERO_UNDEF(Op, DAG);
+  case ISD::CLMUL: {
+    assert(Op.getValueType().isScalableVector() && Subtarget.hasStdExtZvbc() &&
+           "Unexpected custom legalisation");
+    // Promote to i64 vector.
+    MVT VT = Op.getSimpleValueType();
+    MVT I64VecVT = MVT::getVectorVT(MVT::i64, VT.getVectorElementCount());
+    SDLoc DL(Op);
+    SDValue Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, I64VecVT, Op.getOperand(0));
+    SDValue Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, I64VecVT, Op.getOperand(1));
+    SDValue CLMUL = DAG.getNode(ISD::CLMUL, DL, I64VecVT, Op0, Op1);
+    return DAG.getNode(ISD::TRUNCATE, DL, VT, CLMUL);
+  }
   case ISD::FCOPYSIGN:
     if (Op.getValueType() == MVT::f16 || Op.getValueType() == MVT::bf16)
       return lowerFCOPYSIGN(Op, DAG, Subtarget);
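
For reference, the RV32V/RV64V (no-Zvbc) paths in the updated tests below
keep the generic expansion: mask off one bit of one operand at a time,
multiply (a multiply by a single set bit is a shift), and xor the partial
products together, which is exactly the vand/vmul/vxor chains in the CHECK
lines. A scalar sketch of that expansion for i8, under the same clmul
semantics (illustration only, not LLVM code):

  #include <cstdint>

  static uint8_t clmul8_expanded(uint8_t A, uint8_t B) {
    uint8_t R = 0;
    for (unsigned I = 0; I < 8; ++I)
      R ^= (uint8_t)(A * (B & (1u << I))); // vand + vmul + vxor per bit
    return R;
  }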
diff --git a/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll
index 3bcbc9a72c5cd..945419f895c2b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll
@@ -5,74 +5,199 @@
 ; RUN: llc -mtriple=riscv64 -verify-machineinstrs -mattr=+v,+zvbc < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVBC
 
 define <vscale x 1 x i8> @clmul_nxv1i8_vv(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv1i8_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vand.vi v10, v9, 2
-; CHECK-NEXT:    vand.vi v11, v9, 1
-; CHECK-NEXT:    vmul.vv v10, v8, v10
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v11, v10
-; CHECK-NEXT:    vand.vi v11, v9, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vi v11, v9, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vxor.vv v8, v10, v8
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmul_nxv1i8_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv1i8_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv1i8_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf8 v10, v9
+; RV32ZVBC-NEXT:    vzext.vf8 v9, v8
+; RV32ZVBC-NEXT:    vclmul.vv v8, v9, v10
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv1i8_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf8 v10, v9
+; RV64ZVBC-NEXT:    vzext.vf8 v9, v8
+; RV64ZVBC-NEXT:    vclmul.vv v8, v9, v10
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    ret
   %v = call <vscale x 1 x i8> @llvm.clmul.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb)
   ret <vscale x 1 x i8> %v
 }
 
 define <vscale x 1 x i8> @clmul_nxv1i8_vx(<vscale x 1 x i8> %va, i8 %b) nounwind {
-; CHECK-LABEL: clmul_nxv1i8_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v10, v9, 2
-; CHECK-NEXT:    vand.vi v11, v9, 1
-; CHECK-NEXT:    vmul.vv v10, v8, v10
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v11, v10
-; CHECK-NEXT:    vand.vi v11, v9, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vi v11, v9, 8
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vxor.vv v8, v10, v8
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmul_nxv1i8_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; RV32V-NEXT:    vmv.v.x v9, a0
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv1i8_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv1i8_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf8 v10, v8
+; RV32ZVBC-NEXT:    vzext.vf8 v8, v9
+; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv1i8_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf8 v10, v8
+; RV64ZVBC-NEXT:    vzext.vf8 v8, v9
+; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
   %v = call <vscale x 1 x i8> @llvm.clmul.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb)
@@ -80,74 +205,199 @@ define <vscale x 1 x i8> @clmul_nxv1i8_vx(<vscale x 1 x i8> %va, i8 %b) nounwind
 }
 
 define <vscale x 2 x i8> @clmul_nxv2i8_vv(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv2i8_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vand.vi v10, v9, 2
-; CHECK-NEXT:    vand.vi v11, v9, 1
-; CHECK-NEXT:    vmul.vv v10, v8, v10
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v11, v10
-; CHECK-NEXT:    vand.vi v11, v9, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vi v11, v9, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vxor.vv v8, v10, v8
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmul_nxv2i8_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv2i8_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv2i8_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf8 v10, v9
+; RV32ZVBC-NEXT:    vzext.vf8 v12, v8
+; RV32ZVBC-NEXT:    vclmul.vv v8, v12, v10
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v10, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv2i8_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf8 v10, v9
+; RV64ZVBC-NEXT:    vzext.vf8 v12, v8
+; RV64ZVBC-NEXT:    vclmul.vv v8, v12, v10
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v10, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    ret
   %v = call <vscale x 2 x i8> @llvm.clmul.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb)
   ret <vscale x 2 x i8> %v
 }
 
 define <vscale x 2 x i8> @clmul_nxv2i8_vx(<vscale x 2 x i8> %va, i8 %b) nounwind {
-; CHECK-LABEL: clmul_nxv2i8_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v10, v9, 2
-; CHECK-NEXT:    vand.vi v11, v9, 1
-; CHECK-NEXT:    vmul.vv v10, v8, v10
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v11, v10
-; CHECK-NEXT:    vand.vi v11, v9, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vi v11, v9, 8
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vxor.vv v8, v10, v8
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmul_nxv2i8_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV32V-NEXT:    vmv.v.x v9, a0
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv2i8_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv2i8_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf8 v10, v8
+; RV32ZVBC-NEXT:    vzext.vf8 v8, v12
+; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v10, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv2i8_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf8 v10, v8
+; RV64ZVBC-NEXT:    vzext.vf8 v8, v12
+; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v10, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
   %v = call <vscale x 2 x i8> @llvm.clmul.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb)
@@ -155,149 +405,399 @@ define <vscale x 2 x i8> @clmul_nxv2i8_vx(<vscale x 2 x i8> %va, i8 %b) nounwind
 }
 
 define <vscale x 4 x i8> @clmul_nxv4i8_vv(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv4i8_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vi v10, v9, 2
-; CHECK-NEXT:    vand.vi v11, v9, 1
-; CHECK-NEXT:    vmul.vv v10, v8, v10
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v11, v10
-; CHECK-NEXT:    vand.vi v11, v9, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vi v11, v9, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vxor.vv v8, v10, v8
-; CHECK-NEXT:    ret
-  %v = call <vscale x 4 x i8> @llvm.clmul.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb)
-  ret <vscale x 4 x i8> %v
-}
-
-define <vscale x 4 x i8> @clmul_nxv4i8_vx(<vscale x 4 x i8> %va, i8 %b) nounwind {
-; CHECK-LABEL: clmul_nxv4i8_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v10, v9, 2
-; CHECK-NEXT:    vand.vi v11, v9, 1
-; CHECK-NEXT:    vmul.vv v10, v8, v10
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v11, v10
-; CHECK-NEXT:    vand.vi v11, v9, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vi v11, v9, 8
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vxor.vv v8, v10, v8
-; CHECK-NEXT:    ret
-  %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
-  %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
-  %v = call <vscale x 4 x i8> @llvm.clmul.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb)
-  ret <vscale x 4 x i8> %v
-}
-
-define <vscale x 8 x i8> @clmul_nxv8i8_vv(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv8i8_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vi v10, v9, 2
-; CHECK-NEXT:    vand.vi v11, v9, 1
-; CHECK-NEXT:    vmul.vv v10, v8, v10
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v11, v10
-; CHECK-NEXT:    vand.vi v11, v9, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vi v11, v9, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vxor.vv v8, v10, v8
-; CHECK-NEXT:    ret
-  %v = call <vscale x 8 x i8> @llvm.clmul.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb)
-  ret <vscale x 8 x i8> %v
-}
-
-define <vscale x 8 x i8> @clmul_nxv8i8_vx(<vscale x 8 x i8> %va, i8 %b) nounwind {
-; CHECK-LABEL: clmul_nxv8i8_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v10, v9, 2
-; CHECK-NEXT:    vand.vi v11, v9, 1
-; CHECK-NEXT:    vmul.vv v10, v8, v10
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v11, v10
-; CHECK-NEXT:    vand.vi v11, v9, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vi v11, v9, 8
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vxor.vv v8, v10, v8
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmul_nxv4i8_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv4i8_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv4i8_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf8 v12, v9
+; RV32ZVBC-NEXT:    vzext.vf8 v16, v8
+; RV32ZVBC-NEXT:    vclmul.vv v8, v16, v12
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v12, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv4i8_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf8 v12, v9
+; RV64ZVBC-NEXT:    vzext.vf8 v16, v8
+; RV64ZVBC-NEXT:    vclmul.vv v8, v16, v12
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v12, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    ret
+  %v = call <vscale x 4 x i8> @llvm.clmul.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @clmul_nxv4i8_vx(<vscale x 4 x i8> %va, i8 %b) nounwind {
+; RV32V-LABEL: clmul_nxv4i8_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV32V-NEXT:    vmv.v.x v9, a0
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv4i8_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv4i8_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf8 v12, v8
+; RV32ZVBC-NEXT:    vzext.vf8 v8, v16
+; RV32ZVBC-NEXT:    vclmul.vv v8, v12, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v12, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv4i8_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf8 v12, v8
+; RV64ZVBC-NEXT:    vzext.vf8 v8, v16
+; RV64ZVBC-NEXT:    vclmul.vv v8, v12, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v12, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i8> @llvm.clmul.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 8 x i8> @clmul_nxv8i8_vv(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv8i8_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv8i8_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv8i8_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf8 v16, v9
+; RV32ZVBC-NEXT:    vzext.vf8 v24, v8
+; RV32ZVBC-NEXT:    vclmul.vv v8, v24, v16
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v16, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v10, v16, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv8i8_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf8 v16, v9
+; RV64ZVBC-NEXT:    vzext.vf8 v24, v8
+; RV64ZVBC-NEXT:    vclmul.vv v8, v24, v16
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v16, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v10, v16, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC-NEXT:    ret
+  %v = call <vscale x 8 x i8> @llvm.clmul.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb)
+  ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @clmul_nxv8i8_vx(<vscale x 8 x i8> %va, i8 %b) nounwind {
+; RV32V-LABEL: clmul_nxv8i8_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV32V-NEXT:    vmv.v.x v9, a0
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv8i8_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv8i8_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf8 v16, v8
+; RV32ZVBC-NEXT:    vzext.vf8 v8, v24
+; RV32ZVBC-NEXT:    vclmul.vv v8, v16, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v16, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v10, v16, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv8i8_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf8 v16, v8
+; RV64ZVBC-NEXT:    vzext.vf8 v8, v24
+; RV64ZVBC-NEXT:    vclmul.vv v8, v16, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v16, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v10, v16, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
   %v = call <vscale x 8 x i8> @llvm.clmul.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb)
@@ -530,2082 +1030,1305 @@ define <vscale x 64 x i8> @clmul_nxv64i8_vx(<vscale x 64 x i8> %va, i8 %b) nounw
 }
 
 define <vscale x 1 x i16> @clmul_nxv1i16_vv(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv1i16_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vand.vi v10, v9, 2
-; CHECK-NEXT:    vand.vi v11, v9, 1
-; CHECK-NEXT:    vmul.vv v10, v8, v10
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v11, v10
-; CHECK-NEXT:    vand.vi v11, v9, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vi v11, v9, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vxor.vv v8, v10, v8
-; CHECK-NEXT:    ret
-  %v = call <vscale x 1 x i16> @llvm.clmul.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb)
-  ret <vscale x 1 x i16> %v
-}
-
-define <vscale x 1 x i16> @clmul_nxv1i16_vx(<vscale x 1 x i16> %va, i16 %b) nounwind {
-; CHECK-LABEL: clmul_nxv1i16_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v10, v9, 2
-; CHECK-NEXT:    vand.vi v11, v9, 1
-; CHECK-NEXT:    vmul.vv v10, v8, v10
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v11, v10
-; CHECK-NEXT:    vand.vi v11, v9, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vi v11, v9, 8
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vxor.vv v8, v10, v8
-; CHECK-NEXT:    ret
-  %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
-  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
-  %v = call <vscale x 1 x i16> @llvm.clmul.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb)
+; RV32V-LABEL: clmul_nxv1i16_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv1i16_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv1i16_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf4 v10, v9
+; RV32ZVBC-NEXT:    vzext.vf4 v9, v8
+; RV32ZVBC-NEXT:    vclmul.vv v8, v9, v10
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv1i16_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf4 v10, v9
+; RV64ZVBC-NEXT:    vzext.vf4 v9, v8
+; RV64ZVBC-NEXT:    vclmul.vv v8, v9, v10
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    ret
+  %v = call <vscale x 1 x i16> @llvm.clmul.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb)
   ret <vscale x 1 x i16> %v
 }
 
-define <vscale x 2 x i16> @clmul_nxv2i16_vv(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv2i16_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vand.vi v10, v9, 2
-; CHECK-NEXT:    vand.vi v11, v9, 1
-; CHECK-NEXT:    vmul.vv v10, v8, v10
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v11, v10
-; CHECK-NEXT:    vand.vi v11, v9, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vi v11, v9, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vxor.vv v8, v10, v8
-; CHECK-NEXT:    ret
-  %v = call <vscale x 2 x i16> @llvm.clmul.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb)
-  ret <vscale x 2 x i16> %v
-}
-
-define <vscale x 2 x i16> @clmul_nxv2i16_vx(<vscale x 2 x i16> %va, i16 %b) nounwind {
-; CHECK-LABEL: clmul_nxv2i16_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v10, v9, 2
-; CHECK-NEXT:    vand.vi v11, v9, 1
-; CHECK-NEXT:    vmul.vv v10, v8, v10
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v11, v10
-; CHECK-NEXT:    vand.vi v11, v9, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vi v11, v9, 8
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vxor.vv v8, v10, v8
-; CHECK-NEXT:    ret
-  %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
-  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
-  %v = call <vscale x 2 x i16> @llvm.clmul.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb)
-  ret <vscale x 2 x i16> %v
-}
-
-define <vscale x 4 x i16> @clmul_nxv4i16_vv(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv4i16_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vand.vi v10, v9, 2
-; CHECK-NEXT:    vand.vi v11, v9, 1
-; CHECK-NEXT:    vmul.vv v10, v8, v10
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v11, v10
-; CHECK-NEXT:    vand.vi v11, v9, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vi v11, v9, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vxor.vv v8, v10, v8
-; CHECK-NEXT:    ret
-  %v = call <vscale x 4 x i16> @llvm.clmul.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb)
-  ret <vscale x 4 x i16> %v
+define <vscale x 1 x i16> @clmul_nxv1i16_vx(<vscale x 1 x i16> %va, i16 %b) nounwind {
+; RV32V-LABEL: clmul_nxv1i16_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV32V-NEXT:    vmv.v.x v9, a0
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv1i16_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv1i16_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf4 v10, v8
+; RV32ZVBC-NEXT:    vzext.vf4 v8, v9
+; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv1i16_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf4 v10, v8
+; RV64ZVBC-NEXT:    vzext.vf4 v8, v9
+; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i16> @llvm.clmul.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb)
+  ret <vscale x 1 x i16> %v
 }
 
-define <vscale x 4 x i16> @clmul_nxv4i16_vx(<vscale x 4 x i16> %va, i16 %b) nounwind {
-; CHECK-LABEL: clmul_nxv4i16_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v10, v9, 2
-; CHECK-NEXT:    vand.vi v11, v9, 1
-; CHECK-NEXT:    vmul.vv v10, v8, v10
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v11, v10
-; CHECK-NEXT:    vand.vi v11, v9, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vi v11, v9, 8
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vxor.vv v8, v10, v8
-; CHECK-NEXT:    ret
-  %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
-  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
-  %v = call <vscale x 4 x i16> @llvm.clmul.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb)
-  ret <vscale x 4 x i16> %v
-}
-
-define <vscale x 8 x i16> @clmul_nxv8i16_vv(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv8i16_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vand.vi v12, v10, 2
-; CHECK-NEXT:    vand.vi v14, v10, 1
-; CHECK-NEXT:    vmul.vv v12, v8, v12
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v14, v12
-; CHECK-NEXT:    vand.vi v14, v10, 4
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vi v14, v10, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vmul.vv v8, v8, v10
-; CHECK-NEXT:    vxor.vv v8, v12, v8
-; CHECK-NEXT:    ret
-  %v = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb)
-  ret <vscale x 8 x i16> %v
-}
-
-define <vscale x 8 x i16> @clmul_nxv8i16_vx(<vscale x 8 x i16> %va, i16 %b) nounwind {
-; CHECK-LABEL: clmul_nxv8i16_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v12, v10, 2
-; CHECK-NEXT:    vand.vi v14, v10, 1
-; CHECK-NEXT:    vmul.vv v12, v8, v12
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v14, v12
-; CHECK-NEXT:    vand.vi v14, v10, 4
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vi v14, v10, 8
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vmul.vv v8, v8, v10
-; CHECK-NEXT:    vxor.vv v8, v12, v8
-; CHECK-NEXT:    ret
-  %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
-  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
-  %v = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb)
-  ret <vscale x 8 x i16> %v
+define <vscale x 2 x i16> @clmul_nxv2i16_vv(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv2i16_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv2i16_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv2i16_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf4 v10, v9
+; RV32ZVBC-NEXT:    vzext.vf4 v12, v8
+; RV32ZVBC-NEXT:    vclmul.vv v8, v12, v10
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v10, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv2i16_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf4 v10, v9
+; RV64ZVBC-NEXT:    vzext.vf4 v12, v8
+; RV64ZVBC-NEXT:    vclmul.vv v8, v12, v10
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v10, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC-NEXT:    ret
+  %v = call <vscale x 2 x i16> @llvm.clmul.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb)
+  ret <vscale x 2 x i16> %v
 }
 
-define <vscale x 16 x i16> @clmul_nxv16i16_vv(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv16i16_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vand.vi v16, v12, 2
-; CHECK-NEXT:    vand.vi v20, v12, 1
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v20, v16
-; CHECK-NEXT:    vand.vi v20, v12, 4
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vi v20, v12, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vmul.vv v8, v8, v12
-; CHECK-NEXT:    vxor.vv v8, v16, v8
-; CHECK-NEXT:    ret
-  %v = call <vscale x 16 x i16> @llvm.clmul.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb)
-  ret <vscale x 16 x i16> %v
-}
-
-define <vscale x 16 x i16> @clmul_nxv16i16_vx(<vscale x 16 x i16> %va, i16 %b) nounwind {
-; CHECK-LABEL: clmul_nxv16i16_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v16, v12, 2
-; CHECK-NEXT:    vand.vi v20, v12, 1
-; CHECK-NEXT:    vmul.vv v16, v8, v16
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v20, v16
-; CHECK-NEXT:    vand.vi v20, v12, 4
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vi v20, v12, 8
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vmul.vv v8, v8, v12
-; CHECK-NEXT:    vxor.vv v8, v16, v8
-; CHECK-NEXT:    ret
-  %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
-  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
-  %v = call <vscale x 16 x i16> @llvm.clmul.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb)
-  ret <vscale x 16 x i16> %v
+define <vscale x 2 x i16> @clmul_nxv2i16_vx(<vscale x 2 x i16> %va, i16 %b) nounwind {
+; RV32V-LABEL: clmul_nxv2i16_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV32V-NEXT:    vmv.v.x v9, a0
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv2i16_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv2i16_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf4 v10, v8
+; RV32ZVBC-NEXT:    vzext.vf4 v8, v12
+; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v10, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv2i16_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf4 v10, v8
+; RV64ZVBC-NEXT:    vzext.vf4 v8, v12
+; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v10, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i16> @llvm.clmul.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb)
+  ret <vscale x 2 x i16> %v
 }
 
-define <vscale x 32 x i16> @clmul_nxv32i16_vv(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv32i16_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vand.vi v24, v16, 2
-; CHECK-NEXT:    vand.vi v0, v16, 1
-; CHECK-NEXT:    vmul.vv v24, v8, v24
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v0, v24
-; CHECK-NEXT:    vand.vi v0, v16, 4
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vi v0, v16, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    vxor.vv v8, v24, v8
-; CHECK-NEXT:    ret
-  %v = call <vscale x 32 x i16> @llvm.clmul.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb)
-  ret <vscale x 32 x i16> %v
-}
-
-define <vscale x 32 x i16> @clmul_nxv32i16_vx(<vscale x 32 x i16> %va, i16 %b) nounwind {
-; CHECK-LABEL: clmul_nxv32i16_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v24, v16, 2
-; CHECK-NEXT:    vand.vi v0, v16, 1
-; CHECK-NEXT:    vmul.vv v24, v8, v24
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v0, v24
-; CHECK-NEXT:    vand.vi v0, v16, 4
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vi v0, v16, 8
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vand.vx v0, v16, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vmul.vv v0, v8, v0
-; CHECK-NEXT:    vxor.vv v24, v24, v0
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    vxor.vv v8, v24, v8
-; CHECK-NEXT:    ret
-  %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
-  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
-  %v = call <vscale x 32 x i16> @llvm.clmul.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb)
-  ret <vscale x 32 x i16> %v
-}
-
-define <vscale x 1 x i32> @clmul_nxv1i32_vv(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv1i32_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vand.vi v10, v9, 2
-; CHECK-NEXT:    vand.vi v11, v9, 1
-; CHECK-NEXT:    vmul.vv v10, v8, v10
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v11, v10
-; CHECK-NEXT:    vand.vi v11, v9, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vi v11, v9, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 16
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 32
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 64
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 128
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 256
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 512
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 1024
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 2048
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 4096
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 8192
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 16384
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 32768
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 65536
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 131072
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 262144
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 524288
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vxor.vv v8, v10, v8
-; CHECK-NEXT:    ret
-  %v = call <vscale x 1 x i32> @llvm.clmul.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb)
-  ret <vscale x 1 x i32> %v
+define <vscale x 4 x i16> @clmul_nxv4i16_vv(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv4i16_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv4i16_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv4i16_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf4 v12, v9
+; RV32ZVBC-NEXT:    vzext.vf4 v16, v8
+; RV32ZVBC-NEXT:    vclmul.vv v8, v16, v12
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v12, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv4i16_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf4 v12, v9
+; RV64ZVBC-NEXT:    vzext.vf4 v16, v8
+; RV64ZVBC-NEXT:    vclmul.vv v8, v16, v12
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v12, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC-NEXT:    ret
+  %v = call <vscale x 4 x i16> @llvm.clmul.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb)
+  ret <vscale x 4 x i16> %v
 }
 
-define <vscale x 1 x i32> @clmul_nxv1i32_vx(<vscale x 1 x i32> %va, i32 %b) nounwind {
-; RV32-LABEL: clmul_nxv1i32_vx:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
-; RV32-NEXT:    andi s6, a0, 2
-; RV32-NEXT:    andi s5, a0, 1
-; RV32-NEXT:    andi s3, a0, 4
-; RV32-NEXT:    andi s1, a0, 8
-; RV32-NEXT:    andi t6, a0, 16
-; RV32-NEXT:    andi t4, a0, 32
-; RV32-NEXT:    andi t2, a0, 64
-; RV32-NEXT:    andi t0, a0, 128
-; RV32-NEXT:    andi a6, a0, 256
-; RV32-NEXT:    andi a4, a0, 512
-; RV32-NEXT:    andi a2, a0, 1024
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    lui a3, 1
-; RV32-NEXT:    lui a5, 2
-; RV32-NEXT:    lui a7, 4
-; RV32-NEXT:    lui t1, 8
-; RV32-NEXT:    lui t3, 16
-; RV32-NEXT:    lui t5, 32
-; RV32-NEXT:    lui s0, 64
-; RV32-NEXT:    lui s2, 128
-; RV32-NEXT:    lui s4, 256
-; RV32-NEXT:    vsetvli s7, zero, e32, mf2, ta, ma
-; RV32-NEXT:    vmul.vx v9, v8, s6
-; RV32-NEXT:    lui s6, 512
-; RV32-NEXT:    vmul.vx v10, v8, s5
-; RV32-NEXT:    lui s5, 1024
-; RV32-NEXT:    vxor.vv v9, v10, v9
-; RV32-NEXT:    vmul.vx v10, v8, s3
-; RV32-NEXT:    lui s3, 2048
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, s1
-; RV32-NEXT:    lui s1, 4096
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t6
-; RV32-NEXT:    lui t6, 8192
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t4
-; RV32-NEXT:    lui t4, 16384
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t2
-; RV32-NEXT:    lui t2, 32768
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t0
-; RV32-NEXT:    lui t0, 65536
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, a6
-; RV32-NEXT:    lui a6, 131072
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, a4
-; RV32-NEXT:    lui a4, 262144
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, a2
-; RV32-NEXT:    lui a2, 524288
-; RV32-NEXT:    slli a1, a1, 11
-; RV32-NEXT:    and a3, a0, a3
-; RV32-NEXT:    and a5, a0, a5
-; RV32-NEXT:    and a7, a0, a7
-; RV32-NEXT:    and t1, a0, t1
-; RV32-NEXT:    and t3, a0, t3
-; RV32-NEXT:    and t5, a0, t5
-; RV32-NEXT:    and s0, a0, s0
-; RV32-NEXT:    and s2, a0, s2
-; RV32-NEXT:    and s4, a0, s4
-; RV32-NEXT:    and s6, a0, s6
-; RV32-NEXT:    and s5, a0, s5
-; RV32-NEXT:    and s3, a0, s3
-; RV32-NEXT:    and s1, a0, s1
-; RV32-NEXT:    and t6, a0, t6
-; RV32-NEXT:    and t4, a0, t4
-; RV32-NEXT:    and t2, a0, t2
-; RV32-NEXT:    and t0, a0, t0
-; RV32-NEXT:    and a6, a0, a6
-; RV32-NEXT:    and a4, a0, a4
-; RV32-NEXT:    and a2, a0, a2
-; RV32-NEXT:    and a0, a0, a1
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, a0
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, a3
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, a5
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, a7
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t1
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t3
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t5
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, s0
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, s2
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, s4
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, s6
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, s5
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, s3
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, s1
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t6
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t4
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t2
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t0
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, a6
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, a4
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v8, v8, a2
-; RV32-NEXT:    vxor.vv v8, v9, v8
-; RV32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 32
-; RV32-NEXT:    ret
+define <vscale x 4 x i16> @clmul_nxv4i16_vx(<vscale x 4 x i16> %va, i16 %b) nounwind {
+; RV32V-LABEL: clmul_nxv4i16_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV32V-NEXT:    vmv.v.x v9, a0
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
 ;
-; RV64-LABEL: clmul_nxv1i32_vx:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; RV64-NEXT:    vmv.v.x v9, a0
-; RV64-NEXT:    li a0, 16
-; RV64-NEXT:    vand.vi v10, v9, 2
-; RV64-NEXT:    vand.vi v11, v9, 1
-; RV64-NEXT:    vmul.vv v10, v8, v10
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v11, v10
-; RV64-NEXT:    vand.vi v11, v9, 4
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vi v11, v9, 8
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    li a0, 64
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    li a0, 128
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    li a0, 256
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    li a0, 512
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    li a0, 1024
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    slli a0, a0, 11
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 1
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 2
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 4
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 8
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 16
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 32
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 64
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 128
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 256
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 512
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 1024
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 2048
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 4096
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 8192
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 16384
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 32768
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 65536
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 131072
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 262144
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 524288
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vmul.vv v8, v8, v9
-; RV64-NEXT:    vxor.vv v8, v10, v8
-; RV64-NEXT:    ret
-  %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
-  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
-  %v = call <vscale x 1 x i32> @llvm.clmul.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb)
-  ret <vscale x 1 x i32> %v
-}
-
-define <vscale x 2 x i32> @clmul_nxv2i32_vv(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv2i32_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vand.vi v10, v9, 2
-; CHECK-NEXT:    vand.vi v11, v9, 1
-; CHECK-NEXT:    vmul.vv v10, v8, v10
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v11, v10
-; CHECK-NEXT:    vand.vi v11, v9, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vi v11, v9, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 16
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 32
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 64
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 128
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 256
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 512
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 1024
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 2048
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 4096
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 8192
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 16384
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 32768
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 65536
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 131072
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 262144
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v9, a0
-; CHECK-NEXT:    lui a0, 524288
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vmul.vv v11, v8, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vxor.vv v8, v10, v8
-; CHECK-NEXT:    ret
-  %v = call <vscale x 2 x i32> @llvm.clmul.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb)
-  ret <vscale x 2 x i32> %v
+; RV64V-LABEL: clmul_nxv4i16_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv4i16_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf4 v12, v8
+; RV32ZVBC-NEXT:    vzext.vf4 v8, v16
+; RV32ZVBC-NEXT:    vclmul.vv v8, v12, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v12, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv4i16_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf4 v12, v8
+; RV64ZVBC-NEXT:    vzext.vf4 v8, v16
+; RV64ZVBC-NEXT:    vclmul.vv v8, v12, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v12, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.clmul.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb)
+  ret <vscale x 4 x i16> %v
 }
 
-define <vscale x 2 x i32> @clmul_nxv2i32_vx(<vscale x 2 x i32> %va, i32 %b) nounwind {
-; RV32-LABEL: clmul_nxv2i32_vx:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
-; RV32-NEXT:    andi s6, a0, 2
-; RV32-NEXT:    andi s5, a0, 1
-; RV32-NEXT:    andi s3, a0, 4
-; RV32-NEXT:    andi s1, a0, 8
-; RV32-NEXT:    andi t6, a0, 16
-; RV32-NEXT:    andi t4, a0, 32
-; RV32-NEXT:    andi t2, a0, 64
-; RV32-NEXT:    andi t0, a0, 128
-; RV32-NEXT:    andi a6, a0, 256
-; RV32-NEXT:    andi a4, a0, 512
-; RV32-NEXT:    andi a2, a0, 1024
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    lui a3, 1
-; RV32-NEXT:    lui a5, 2
-; RV32-NEXT:    lui a7, 4
-; RV32-NEXT:    lui t1, 8
-; RV32-NEXT:    lui t3, 16
-; RV32-NEXT:    lui t5, 32
-; RV32-NEXT:    lui s0, 64
-; RV32-NEXT:    lui s2, 128
-; RV32-NEXT:    lui s4, 256
-; RV32-NEXT:    vsetvli s7, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmul.vx v9, v8, s6
-; RV32-NEXT:    lui s6, 512
-; RV32-NEXT:    vmul.vx v10, v8, s5
-; RV32-NEXT:    lui s5, 1024
-; RV32-NEXT:    vxor.vv v9, v10, v9
-; RV32-NEXT:    vmul.vx v10, v8, s3
-; RV32-NEXT:    lui s3, 2048
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, s1
-; RV32-NEXT:    lui s1, 4096
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t6
-; RV32-NEXT:    lui t6, 8192
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t4
-; RV32-NEXT:    lui t4, 16384
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t2
-; RV32-NEXT:    lui t2, 32768
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t0
-; RV32-NEXT:    lui t0, 65536
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, a6
-; RV32-NEXT:    lui a6, 131072
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, a4
-; RV32-NEXT:    lui a4, 262144
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, a2
-; RV32-NEXT:    lui a2, 524288
-; RV32-NEXT:    slli a1, a1, 11
-; RV32-NEXT:    and a3, a0, a3
-; RV32-NEXT:    and a5, a0, a5
-; RV32-NEXT:    and a7, a0, a7
-; RV32-NEXT:    and t1, a0, t1
-; RV32-NEXT:    and t3, a0, t3
-; RV32-NEXT:    and t5, a0, t5
-; RV32-NEXT:    and s0, a0, s0
-; RV32-NEXT:    and s2, a0, s2
-; RV32-NEXT:    and s4, a0, s4
-; RV32-NEXT:    and s6, a0, s6
-; RV32-NEXT:    and s5, a0, s5
-; RV32-NEXT:    and s3, a0, s3
-; RV32-NEXT:    and s1, a0, s1
-; RV32-NEXT:    and t6, a0, t6
-; RV32-NEXT:    and t4, a0, t4
-; RV32-NEXT:    and t2, a0, t2
-; RV32-NEXT:    and t0, a0, t0
-; RV32-NEXT:    and a6, a0, a6
-; RV32-NEXT:    and a4, a0, a4
-; RV32-NEXT:    and a2, a0, a2
-; RV32-NEXT:    and a0, a0, a1
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, a0
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, a3
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, a5
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, a7
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t1
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t3
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t5
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, s0
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, s2
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, s4
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, s6
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, s5
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, s3
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, s1
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t6
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t4
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t2
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, t0
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, a6
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v10, v8, a4
-; RV32-NEXT:    vxor.vv v9, v9, v10
-; RV32-NEXT:    vmul.vx v8, v8, a2
-; RV32-NEXT:    vxor.vv v8, v9, v8
-; RV32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 32
-; RV32-NEXT:    ret
+define <vscale x 8 x i16> @clmul_nxv8i16_vv(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv8i16_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; RV32V-NEXT:    vand.vi v12, v10, 2
+; RV32V-NEXT:    vand.vi v14, v10, 1
+; RV32V-NEXT:    vmul.vv v12, v8, v12
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v14, v12
+; RV32V-NEXT:    vand.vi v14, v10, 4
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vi v14, v10, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v10, v10, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vmul.vv v8, v8, v10
+; RV32V-NEXT:    vxor.vv v8, v12, v8
+; RV32V-NEXT:    ret
 ;
-; RV64-LABEL: clmul_nxv2i32_vx:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v9, a0
-; RV64-NEXT:    li a0, 16
-; RV64-NEXT:    vand.vi v10, v9, 2
-; RV64-NEXT:    vand.vi v11, v9, 1
-; RV64-NEXT:    vmul.vv v10, v8, v10
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v11, v10
-; RV64-NEXT:    vand.vi v11, v9, 4
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vi v11, v9, 8
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    li a0, 64
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    li a0, 128
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    li a0, 256
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    li a0, 512
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    li a0, 1024
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    slli a0, a0, 11
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 1
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 2
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 4
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 8
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 16
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 32
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 64
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 128
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 256
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 512
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 1024
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 2048
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 4096
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 8192
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 16384
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 32768
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 65536
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 131072
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 262144
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vand.vx v11, v9, a0
-; RV64-NEXT:    lui a0, 524288
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vmul.vv v11, v8, v11
-; RV64-NEXT:    vxor.vv v10, v10, v11
-; RV64-NEXT:    vmul.vv v8, v8, v9
-; RV64-NEXT:    vxor.vv v8, v10, v8
-; RV64-NEXT:    ret
-  %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
-  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-  %v = call <vscale x 2 x i32> @llvm.clmul.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb)
-  ret <vscale x 2 x i32> %v
+; RV64V-LABEL: clmul_nxv8i16_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; RV64V-NEXT:    vand.vi v12, v10, 2
+; RV64V-NEXT:    vand.vi v14, v10, 1
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v14, v12
+; RV64V-NEXT:    vand.vi v14, v10, 4
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vi v14, v10, 8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v10, v10, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vv v8, v8, v10
+; RV64V-NEXT:    vxor.vv v8, v12, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv8i16_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf4 v16, v10
+; RV32ZVBC-NEXT:    vzext.vf4 v24, v8
+; RV32ZVBC-NEXT:    vclmul.vv v8, v24, v16
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v16, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v16, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv8i16_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf4 v16, v10
+; RV64ZVBC-NEXT:    vzext.vf4 v24, v8
+; RV64ZVBC-NEXT:    vclmul.vv v8, v24, v16
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v16, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v16, 0
+; RV64ZVBC-NEXT:    ret
+  %v = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb)
+  ret <vscale x 8 x i16> %v
 }
 
-define <vscale x 4 x i32> @clmul_nxv4i32_vv(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv4i32_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vand.vi v12, v10, 2
-; CHECK-NEXT:    vand.vi v14, v10, 1
-; CHECK-NEXT:    vmul.vv v12, v8, v12
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v14, v12
-; CHECK-NEXT:    vand.vi v14, v10, 4
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vi v14, v10, 8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 16
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 32
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 64
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 128
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 256
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 512
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 1024
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 2048
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 4096
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 8192
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 16384
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 32768
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 65536
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 131072
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 262144
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v10, a0
-; CHECK-NEXT:    lui a0, 524288
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vmul.vv v14, v8, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vmul.vv v8, v8, v10
-; CHECK-NEXT:    vxor.vv v8, v12, v8
-; CHECK-NEXT:    ret
-  %v = call <vscale x 4 x i32> @llvm.clmul.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb)
-  ret <vscale x 4 x i32> %v
-}
-
-define <vscale x 4 x i32> @clmul_nxv4i32_vx(<vscale x 4 x i32> %va, i32 %b) nounwind {
-; RV32-LABEL: clmul_nxv4i32_vx:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
-; RV32-NEXT:    andi s6, a0, 2
-; RV32-NEXT:    andi s5, a0, 1
-; RV32-NEXT:    andi s3, a0, 4
-; RV32-NEXT:    andi s1, a0, 8
-; RV32-NEXT:    andi t6, a0, 16
-; RV32-NEXT:    andi t4, a0, 32
-; RV32-NEXT:    andi t2, a0, 64
-; RV32-NEXT:    andi t0, a0, 128
-; RV32-NEXT:    andi a6, a0, 256
-; RV32-NEXT:    andi a4, a0, 512
-; RV32-NEXT:    andi a2, a0, 1024
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    lui a3, 1
-; RV32-NEXT:    lui a5, 2
-; RV32-NEXT:    lui a7, 4
-; RV32-NEXT:    lui t1, 8
-; RV32-NEXT:    lui t3, 16
-; RV32-NEXT:    lui t5, 32
-; RV32-NEXT:    lui s0, 64
-; RV32-NEXT:    lui s2, 128
-; RV32-NEXT:    lui s4, 256
-; RV32-NEXT:    vsetvli s7, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmul.vx v10, v8, s6
-; RV32-NEXT:    lui s6, 512
-; RV32-NEXT:    vmul.vx v12, v8, s5
-; RV32-NEXT:    lui s5, 1024
-; RV32-NEXT:    vxor.vv v10, v12, v10
-; RV32-NEXT:    vmul.vx v12, v8, s3
-; RV32-NEXT:    lui s3, 2048
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, s1
-; RV32-NEXT:    lui s1, 4096
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, t6
-; RV32-NEXT:    lui t6, 8192
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, t4
-; RV32-NEXT:    lui t4, 16384
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, t2
-; RV32-NEXT:    lui t2, 32768
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, t0
-; RV32-NEXT:    lui t0, 65536
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, a6
-; RV32-NEXT:    lui a6, 131072
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, a4
-; RV32-NEXT:    lui a4, 262144
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, a2
-; RV32-NEXT:    lui a2, 524288
-; RV32-NEXT:    slli a1, a1, 11
-; RV32-NEXT:    and a3, a0, a3
-; RV32-NEXT:    and a5, a0, a5
-; RV32-NEXT:    and a7, a0, a7
-; RV32-NEXT:    and t1, a0, t1
-; RV32-NEXT:    and t3, a0, t3
-; RV32-NEXT:    and t5, a0, t5
-; RV32-NEXT:    and s0, a0, s0
-; RV32-NEXT:    and s2, a0, s2
-; RV32-NEXT:    and s4, a0, s4
-; RV32-NEXT:    and s6, a0, s6
-; RV32-NEXT:    and s5, a0, s5
-; RV32-NEXT:    and s3, a0, s3
-; RV32-NEXT:    and s1, a0, s1
-; RV32-NEXT:    and t6, a0, t6
-; RV32-NEXT:    and t4, a0, t4
-; RV32-NEXT:    and t2, a0, t2
-; RV32-NEXT:    and t0, a0, t0
-; RV32-NEXT:    and a6, a0, a6
-; RV32-NEXT:    and a4, a0, a4
-; RV32-NEXT:    and a2, a0, a2
-; RV32-NEXT:    and a0, a0, a1
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, a0
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, a3
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, a5
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, a7
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, t1
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, t3
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, t5
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, s0
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, s2
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, s4
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, s6
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, s5
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, s3
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, s1
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, t6
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, t4
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, t2
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, t0
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, a6
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v12, v8, a4
-; RV32-NEXT:    vxor.vv v10, v10, v12
-; RV32-NEXT:    vmul.vx v8, v8, a2
-; RV32-NEXT:    vxor.vv v8, v10, v8
-; RV32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 32
-; RV32-NEXT:    ret
+define <vscale x 8 x i16> @clmul_nxv8i16_vx(<vscale x 8 x i16> %va, i16 %b) nounwind {
+; RV32V-LABEL: clmul_nxv8i16_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV32V-NEXT:    vmv.v.x v10, a0
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v12, v10, 2
+; RV32V-NEXT:    vand.vi v14, v10, 1
+; RV32V-NEXT:    vmul.vv v12, v8, v12
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v14, v12
+; RV32V-NEXT:    vand.vi v14, v10, 4
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vi v14, v10, 8
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v10, v10, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vmul.vv v8, v8, v10
+; RV32V-NEXT:    vxor.vv v8, v12, v8
+; RV32V-NEXT:    ret
 ;
-; RV64-LABEL: clmul_nxv4i32_vx:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a0
-; RV64-NEXT:    li a0, 16
-; RV64-NEXT:    vand.vi v12, v10, 2
-; RV64-NEXT:    vand.vi v14, v10, 1
-; RV64-NEXT:    vmul.vv v12, v8, v12
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v14, v12
-; RV64-NEXT:    vand.vi v14, v10, 4
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vi v14, v10, 8
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    li a0, 64
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    li a0, 128
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    li a0, 256
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    li a0, 512
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    li a0, 1024
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    slli a0, a0, 11
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 1
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 2
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 4
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 8
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 16
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 32
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 64
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 128
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 256
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 512
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 1024
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 2048
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 4096
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 8192
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 16384
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 32768
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 65536
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 131072
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 262144
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vand.vx v14, v10, a0
-; RV64-NEXT:    lui a0, 524288
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vmul.vv v14, v8, v14
-; RV64-NEXT:    vxor.vv v12, v12, v14
-; RV64-NEXT:    vmul.vv v8, v8, v10
-; RV64-NEXT:    vxor.vv v8, v12, v8
-; RV64-NEXT:    ret
-  %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
-  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-  %v = call <vscale x 4 x i32> @llvm.clmul.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb)
-  ret <vscale x 4 x i32> %v
+; RV64V-LABEL: clmul_nxv8i16_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV64V-NEXT:    vmv.v.x v10, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v12, v10, 2
+; RV64V-NEXT:    vand.vi v14, v10, 1
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v14, v12
+; RV64V-NEXT:    vand.vi v14, v10, 4
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vi v14, v10, 8
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v10, v10, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vv v8, v8, v10
+; RV64V-NEXT:    vxor.vv v8, v12, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv8i16_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf4 v16, v8
+; RV32ZVBC-NEXT:    vzext.vf4 v8, v24
+; RV32ZVBC-NEXT:    vclmul.vv v8, v16, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v16, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v16, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv8i16_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf4 v16, v8
+; RV64ZVBC-NEXT:    vzext.vf4 v8, v24
+; RV64ZVBC-NEXT:    vclmul.vv v8, v16, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v16, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v16, 0
+; RV64ZVBC-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb)
+  ret <vscale x 8 x i16> %v
 }
 
-define <vscale x 8 x i32> @clmul_nxv8i32_vv(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb) nounwind {
-; CHECK-LABEL: clmul_nxv8i32_vv:
+define <vscale x 16 x i16> @clmul_nxv16i16_vv(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb) nounwind {
+; CHECK-LABEL: clmul_nxv16i16_vv:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vand.vi v16, v12, 2
 ; CHECK-NEXT:    vand.vi v20, v12, 1
 ; CHECK-NEXT:    vmul.vv v16, v8, v16
@@ -2661,350 +2384,2551 @@ define <vscale x 8 x i32> @clmul_nxv8i32_vv(<vscale x 8 x i32> %va, <vscale x 8
 ; CHECK-NEXT:    vxor.vv v16, v16, v20
 ; CHECK-NEXT:    vand.vx v20, v12, a0
 ; CHECK-NEXT:    lui a0, 8
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vmul.vv v20, v8, v20
 ; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 16
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 32
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 64
-; CHECK-NEXT:    vmul.vv v20, v8, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 128
+; CHECK-NEXT:    vmul.vv v8, v8, v12
+; CHECK-NEXT:    vxor.vv v8, v16, v8
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i16> @llvm.clmul.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @clmul_nxv16i16_vx(<vscale x 16 x i16> %va, i16 %b) nounwind {
+; CHECK-LABEL: clmul_nxv16i16_vx:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vand.vi v16, v12, 2
+; CHECK-NEXT:    vand.vi v20, v12, 1
+; CHECK-NEXT:    vmul.vv v16, v8, v16
+; CHECK-NEXT:    vmul.vv v20, v8, v20
+; CHECK-NEXT:    vxor.vv v16, v20, v16
+; CHECK-NEXT:    vand.vi v20, v12, 4
 ; CHECK-NEXT:    vmul.vv v20, v8, v20
 ; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 256
+; CHECK-NEXT:    vand.vi v20, v12, 8
 ; CHECK-NEXT:    vmul.vv v20, v8, v20
 ; CHECK-NEXT:    vxor.vv v16, v16, v20
 ; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 512
+; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vmul.vv v20, v8, v20
 ; CHECK-NEXT:    vxor.vv v16, v16, v20
 ; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 1024
+; CHECK-NEXT:    li a0, 64
 ; CHECK-NEXT:    vmul.vv v20, v8, v20
 ; CHECK-NEXT:    vxor.vv v16, v16, v20
 ; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 2048
+; CHECK-NEXT:    li a0, 128
 ; CHECK-NEXT:    vmul.vv v20, v8, v20
 ; CHECK-NEXT:    vxor.vv v16, v16, v20
 ; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 4096
+; CHECK-NEXT:    li a0, 256
 ; CHECK-NEXT:    vmul.vv v20, v8, v20
 ; CHECK-NEXT:    vxor.vv v16, v16, v20
 ; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 8192
+; CHECK-NEXT:    li a0, 512
 ; CHECK-NEXT:    vmul.vv v20, v8, v20
 ; CHECK-NEXT:    vxor.vv v16, v16, v20
 ; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 16384
+; CHECK-NEXT:    li a0, 1024
 ; CHECK-NEXT:    vmul.vv v20, v8, v20
 ; CHECK-NEXT:    vxor.vv v16, v16, v20
 ; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 32768
+; CHECK-NEXT:    li a0, 1
+; CHECK-NEXT:    slli a0, a0, 11
 ; CHECK-NEXT:    vmul.vv v20, v8, v20
 ; CHECK-NEXT:    vxor.vv v16, v16, v20
 ; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 65536
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vmul.vv v20, v8, v20
 ; CHECK-NEXT:    vxor.vv v16, v16, v20
 ; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 131072
+; CHECK-NEXT:    lui a0, 2
 ; CHECK-NEXT:    vmul.vv v20, v8, v20
 ; CHECK-NEXT:    vxor.vv v16, v16, v20
 ; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 262144
+; CHECK-NEXT:    lui a0, 4
 ; CHECK-NEXT:    vmul.vv v20, v8, v20
 ; CHECK-NEXT:    vxor.vv v16, v16, v20
 ; CHECK-NEXT:    vand.vx v20, v12, a0
-; CHECK-NEXT:    lui a0, 524288
+; CHECK-NEXT:    lui a0, 8
 ; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vmul.vv v20, v8, v20
 ; CHECK-NEXT:    vxor.vv v16, v16, v20
 ; CHECK-NEXT:    vmul.vv v8, v8, v12
 ; CHECK-NEXT:    vxor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+  %v = call <vscale x 16 x i16> @llvm.clmul.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb)
+  ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 32 x i16> @clmul_nxv32i16_vv(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb) nounwind {
+; CHECK-LABEL: clmul_nxv32i16_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vand.vi v24, v16, 2
+; CHECK-NEXT:    vand.vi v0, v16, 1
+; CHECK-NEXT:    vmul.vv v24, v8, v24
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v0, v24
+; CHECK-NEXT:    vand.vi v0, v16, 4
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vi v0, v16, 8
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    li a0, 64
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    li a0, 128
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    li a0, 256
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    li a0, 512
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    li a0, 1024
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    li a0, 1
+; CHECK-NEXT:    slli a0, a0, 11
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    lui a0, 2
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    lui a0, 4
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    lui a0, 8
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vmul.vv v8, v8, v16
+; CHECK-NEXT:    vxor.vv v8, v24, v8
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x i16> @llvm.clmul.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @clmul_nxv32i16_vx(<vscale x 32 x i16> %va, i16 %b) nounwind {
+; CHECK-LABEL: clmul_nxv32i16_vx:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vand.vi v24, v16, 2
+; CHECK-NEXT:    vand.vi v0, v16, 1
+; CHECK-NEXT:    vmul.vv v24, v8, v24
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v0, v24
+; CHECK-NEXT:    vand.vi v0, v16, 4
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vi v0, v16, 8
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    li a0, 64
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    li a0, 128
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    li a0, 256
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    li a0, 512
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    li a0, 1024
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    li a0, 1
+; CHECK-NEXT:    slli a0, a0, 11
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    lui a0, 2
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    lui a0, 4
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vand.vx v0, v16, a0
+; CHECK-NEXT:    lui a0, 8
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vmul.vv v0, v8, v0
+; CHECK-NEXT:    vxor.vv v24, v24, v0
+; CHECK-NEXT:    vmul.vv v8, v8, v16
+; CHECK-NEXT:    vxor.vv v8, v24, v8
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i16> @llvm.clmul.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb)
+  ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 1 x i32> @clmul_nxv1i32_vv(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv1i32_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 16
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 128
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 256
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 512
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 2048
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 4096
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 8192
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 16384
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 32768
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 65536
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 131072
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv1i32_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2048
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8192
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 16384
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 32768
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 65536
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 131072
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 262144
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 524288
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv1i32_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf2 v10, v9
+; RV32ZVBC-NEXT:    vzext.vf2 v9, v8
+; RV32ZVBC-NEXT:    vclmul.vv v8, v9, v10
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv1i32_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf2 v10, v9
+; RV64ZVBC-NEXT:    vzext.vf2 v9, v8
+; RV64ZVBC-NEXT:    vclmul.vv v8, v9, v10
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    ret
+  %v = call <vscale x 1 x i32> @llvm.clmul.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb)
+  ret <vscale x 1 x i32> %v
+}
+
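+; In the .vx splat form, RV32V precomputes all 32 single-bit masks of the
+; scalar in GPRs (the andi/lui/and sequences) and folds them into vmul.vx,
+; which is why eight callee-saved registers are spilled. RV64V instead
+; splats the scalar with vmv.v.x and reuses the vector mask ladder from
+; the .vv form. The Zvbc variants splat and then promote exactly as in .vv.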
+define <vscale x 1 x i32> @clmul_nxv1i32_vx(<vscale x 1 x i32> %va, i32 %b) nounwind {
+; RV32V-LABEL: clmul_nxv1i32_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -32
+; RV32V-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    andi s6, a0, 2
+; RV32V-NEXT:    andi s5, a0, 1
+; RV32V-NEXT:    andi s3, a0, 4
+; RV32V-NEXT:    andi s1, a0, 8
+; RV32V-NEXT:    andi t6, a0, 16
+; RV32V-NEXT:    andi t4, a0, 32
+; RV32V-NEXT:    andi t2, a0, 64
+; RV32V-NEXT:    andi t0, a0, 128
+; RV32V-NEXT:    andi a6, a0, 256
+; RV32V-NEXT:    andi a4, a0, 512
+; RV32V-NEXT:    andi a2, a0, 1024
+; RV32V-NEXT:    li a1, 1
+; RV32V-NEXT:    lui a3, 1
+; RV32V-NEXT:    lui a5, 2
+; RV32V-NEXT:    lui a7, 4
+; RV32V-NEXT:    lui t1, 8
+; RV32V-NEXT:    lui t3, 16
+; RV32V-NEXT:    lui t5, 32
+; RV32V-NEXT:    lui s0, 64
+; RV32V-NEXT:    lui s2, 128
+; RV32V-NEXT:    lui s4, 256
+; RV32V-NEXT:    vsetvli s7, zero, e32, mf2, ta, ma
+; RV32V-NEXT:    vmul.vx v9, v8, s6
+; RV32V-NEXT:    lui s6, 512
+; RV32V-NEXT:    vmul.vx v10, v8, s5
+; RV32V-NEXT:    lui s5, 1024
+; RV32V-NEXT:    vxor.vv v9, v10, v9
+; RV32V-NEXT:    vmul.vx v10, v8, s3
+; RV32V-NEXT:    lui s3, 2048
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s1
+; RV32V-NEXT:    lui s1, 4096
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t6
+; RV32V-NEXT:    lui t6, 8192
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t4
+; RV32V-NEXT:    lui t4, 16384
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t2
+; RV32V-NEXT:    lui t2, 32768
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t0
+; RV32V-NEXT:    lui t0, 65536
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a6
+; RV32V-NEXT:    lui a6, 131072
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a4
+; RV32V-NEXT:    lui a4, 262144
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a2
+; RV32V-NEXT:    lui a2, 524288
+; RV32V-NEXT:    slli a1, a1, 11
+; RV32V-NEXT:    and a3, a0, a3
+; RV32V-NEXT:    and a5, a0, a5
+; RV32V-NEXT:    and a7, a0, a7
+; RV32V-NEXT:    and t1, a0, t1
+; RV32V-NEXT:    and t3, a0, t3
+; RV32V-NEXT:    and t5, a0, t5
+; RV32V-NEXT:    and s0, a0, s0
+; RV32V-NEXT:    and s2, a0, s2
+; RV32V-NEXT:    and s4, a0, s4
+; RV32V-NEXT:    and s6, a0, s6
+; RV32V-NEXT:    and s5, a0, s5
+; RV32V-NEXT:    and s3, a0, s3
+; RV32V-NEXT:    and s1, a0, s1
+; RV32V-NEXT:    and t6, a0, t6
+; RV32V-NEXT:    and t4, a0, t4
+; RV32V-NEXT:    and t2, a0, t2
+; RV32V-NEXT:    and t0, a0, t0
+; RV32V-NEXT:    and a6, a0, a6
+; RV32V-NEXT:    and a4, a0, a4
+; RV32V-NEXT:    and a2, a0, a2
+; RV32V-NEXT:    and a0, a0, a1
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a0
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a3
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a5
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a7
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t1
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t3
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t5
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s0
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s2
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s4
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s6
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s5
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s3
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s1
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t6
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t4
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t2
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t0
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a6
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a4
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v8, v8, a2
+; RV32V-NEXT:    vxor.vv v8, v9, v8
+; RV32V-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    addi sp, sp, 32
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv1i32_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2048
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8192
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 16384
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 32768
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 65536
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 131072
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 262144
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 524288
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv1i32_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC-NEXT:    vzext.vf2 v8, v9
+; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv1i32_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC-NEXT:    vzext.vf2 v8, v9
+; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i32> @llvm.clmul.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb)
+  ret <vscale x 1 x i32> %v
+}
+
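+; Promoting e32 to e64 doubles the register-group size: from here on an
+; m1 i32 source becomes an m2 i64 operand pair for vclmul.vv, and the
+; result is narrowed back to m1 with vnsrl.wi.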
+define <vscale x 2 x i32> @clmul_nxv2i32_vv(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv2i32_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV32V-NEXT:    vand.vi v10, v9, 2
+; RV32V-NEXT:    vand.vi v11, v9, 1
+; RV32V-NEXT:    vmul.vv v10, v8, v10
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v9, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v9, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 16
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 32
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 64
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 128
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 256
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 512
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 2048
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 4096
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 8192
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 16384
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 32768
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 65536
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 131072
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v9, a0
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    vand.vx v9, v9, a0
+; RV32V-NEXT:    vmul.vv v11, v8, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v8, v9
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv2i32_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2048
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8192
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 16384
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 32768
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 65536
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 131072
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 262144
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 524288
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv2i32_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf2 v10, v9
+; RV32ZVBC-NEXT:    vzext.vf2 v12, v8
+; RV32ZVBC-NEXT:    vclmul.vv v10, v12, v10
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv2i32_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf2 v10, v9
+; RV64ZVBC-NEXT:    vzext.vf2 v12, v8
+; RV64ZVBC-NEXT:    vclmul.vv v10, v12, v10
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC-NEXT:    ret
+  %v = call <vscale x 2 x i32> @llvm.clmul.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb)
+  ret <vscale x 2 x i32> %v
+}
+
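+; The scalar mask computation in the .vx form is independent of LMUL, so
+; the RV32V body below matches the nxv1i32 version apart from the vsetvli
+; (e32, m1 instead of e32, mf2).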
+define <vscale x 2 x i32> @clmul_nxv2i32_vx(<vscale x 2 x i32> %va, i32 %b) nounwind {
+; RV32V-LABEL: clmul_nxv2i32_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -32
+; RV32V-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    andi s6, a0, 2
+; RV32V-NEXT:    andi s5, a0, 1
+; RV32V-NEXT:    andi s3, a0, 4
+; RV32V-NEXT:    andi s1, a0, 8
+; RV32V-NEXT:    andi t6, a0, 16
+; RV32V-NEXT:    andi t4, a0, 32
+; RV32V-NEXT:    andi t2, a0, 64
+; RV32V-NEXT:    andi t0, a0, 128
+; RV32V-NEXT:    andi a6, a0, 256
+; RV32V-NEXT:    andi a4, a0, 512
+; RV32V-NEXT:    andi a2, a0, 1024
+; RV32V-NEXT:    li a1, 1
+; RV32V-NEXT:    lui a3, 1
+; RV32V-NEXT:    lui a5, 2
+; RV32V-NEXT:    lui a7, 4
+; RV32V-NEXT:    lui t1, 8
+; RV32V-NEXT:    lui t3, 16
+; RV32V-NEXT:    lui t5, 32
+; RV32V-NEXT:    lui s0, 64
+; RV32V-NEXT:    lui s2, 128
+; RV32V-NEXT:    lui s4, 256
+; RV32V-NEXT:    vsetvli s7, zero, e32, m1, ta, ma
+; RV32V-NEXT:    vmul.vx v9, v8, s6
+; RV32V-NEXT:    lui s6, 512
+; RV32V-NEXT:    vmul.vx v10, v8, s5
+; RV32V-NEXT:    lui s5, 1024
+; RV32V-NEXT:    vxor.vv v9, v10, v9
+; RV32V-NEXT:    vmul.vx v10, v8, s3
+; RV32V-NEXT:    lui s3, 2048
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s1
+; RV32V-NEXT:    lui s1, 4096
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t6
+; RV32V-NEXT:    lui t6, 8192
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t4
+; RV32V-NEXT:    lui t4, 16384
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t2
+; RV32V-NEXT:    lui t2, 32768
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t0
+; RV32V-NEXT:    lui t0, 65536
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a6
+; RV32V-NEXT:    lui a6, 131072
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a4
+; RV32V-NEXT:    lui a4, 262144
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a2
+; RV32V-NEXT:    lui a2, 524288
+; RV32V-NEXT:    slli a1, a1, 11
+; RV32V-NEXT:    and a3, a0, a3
+; RV32V-NEXT:    and a5, a0, a5
+; RV32V-NEXT:    and a7, a0, a7
+; RV32V-NEXT:    and t1, a0, t1
+; RV32V-NEXT:    and t3, a0, t3
+; RV32V-NEXT:    and t5, a0, t5
+; RV32V-NEXT:    and s0, a0, s0
+; RV32V-NEXT:    and s2, a0, s2
+; RV32V-NEXT:    and s4, a0, s4
+; RV32V-NEXT:    and s6, a0, s6
+; RV32V-NEXT:    and s5, a0, s5
+; RV32V-NEXT:    and s3, a0, s3
+; RV32V-NEXT:    and s1, a0, s1
+; RV32V-NEXT:    and t6, a0, t6
+; RV32V-NEXT:    and t4, a0, t4
+; RV32V-NEXT:    and t2, a0, t2
+; RV32V-NEXT:    and t0, a0, t0
+; RV32V-NEXT:    and a6, a0, a6
+; RV32V-NEXT:    and a4, a0, a4
+; RV32V-NEXT:    and a2, a0, a2
+; RV32V-NEXT:    and a0, a0, a1
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a0
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a3
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a5
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a7
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t1
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t3
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t5
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s0
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s2
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s4
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s6
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s5
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s3
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, s1
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t6
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t4
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t2
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, t0
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a6
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v10, v8, a4
+; RV32V-NEXT:    vxor.vv v9, v9, v10
+; RV32V-NEXT:    vmul.vx v8, v8, a2
+; RV32V-NEXT:    vxor.vv v8, v9, v8
+; RV32V-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    addi sp, sp, 32
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv2i32_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v10, v9, 2
+; RV64V-NEXT:    vand.vi v11, v9, 1
+; RV64V-NEXT:    vmul.vv v10, v8, v10
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v9, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v9, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 32
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 64
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 128
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 256
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 512
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 2048
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 8192
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 16384
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 32768
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 65536
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 131072
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 262144
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v9, a0
+; RV64V-NEXT:    lui a0, 524288
+; RV64V-NEXT:    vand.vx v9, v9, a0
+; RV64V-NEXT:    vmul.vv v11, v8, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v8, v9
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv2i32_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf2 v10, v8
+; RV32ZVBC-NEXT:    vzext.vf2 v8, v12
+; RV32ZVBC-NEXT:    vclmul.vv v10, v10, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv2i32_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf2 v10, v8
+; RV64ZVBC-NEXT:    vzext.vf2 v8, v12
+; RV64ZVBC-NEXT:    vclmul.vv v10, v10, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.clmul.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb)
+  ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 4 x i32> @clmul_nxv4i32_vv(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv4i32_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV32V-NEXT:    vand.vi v12, v10, 2
+; RV32V-NEXT:    vand.vi v14, v10, 1
+; RV32V-NEXT:    vmul.vv v12, v8, v12
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v14, v12
+; RV32V-NEXT:    vand.vi v14, v10, 4
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vi v14, v10, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 16
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 32
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 64
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 128
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 256
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 512
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 1024
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 2048
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 4096
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 8192
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 16384
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 32768
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 65536
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 131072
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v10, a0
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    vand.vx v10, v10, a0
+; RV32V-NEXT:    vmul.vv v14, v8, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vmul.vv v8, v8, v10
+; RV32V-NEXT:    vxor.vv v8, v12, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv4i32_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV64V-NEXT:    vand.vi v12, v10, 2
+; RV64V-NEXT:    vand.vi v14, v10, 1
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v14, v12
+; RV64V-NEXT:    vand.vi v14, v10, 4
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vi v14, v10, 8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 32
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 64
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 128
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 256
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 512
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 1024
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 2048
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 8192
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 16384
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 32768
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 65536
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 131072
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 262144
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 524288
+; RV64V-NEXT:    vand.vx v10, v10, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vv v8, v8, v10
+; RV64V-NEXT:    vxor.vv v8, v12, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv4i32_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf2 v12, v10
+; RV32ZVBC-NEXT:    vzext.vf2 v16, v8
+; RV32ZVBC-NEXT:    vclmul.vv v12, v16, v12
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv4i32_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf2 v12, v10
+; RV64ZVBC-NEXT:    vzext.vf2 v16, v8
+; RV64ZVBC-NEXT:    vclmul.vv v12, v16, v12
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC-NEXT:    ret
+  %v = call <vscale x 4 x i32> @llvm.clmul.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @clmul_nxv4i32_vx(<vscale x 4 x i32> %va, i32 %b) nounwind {
+; RV32V-LABEL: clmul_nxv4i32_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -32
+; RV32V-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    andi s6, a0, 2
+; RV32V-NEXT:    andi s5, a0, 1
+; RV32V-NEXT:    andi s3, a0, 4
+; RV32V-NEXT:    andi s1, a0, 8
+; RV32V-NEXT:    andi t6, a0, 16
+; RV32V-NEXT:    andi t4, a0, 32
+; RV32V-NEXT:    andi t2, a0, 64
+; RV32V-NEXT:    andi t0, a0, 128
+; RV32V-NEXT:    andi a6, a0, 256
+; RV32V-NEXT:    andi a4, a0, 512
+; RV32V-NEXT:    andi a2, a0, 1024
+; RV32V-NEXT:    li a1, 1
+; RV32V-NEXT:    lui a3, 1
+; RV32V-NEXT:    lui a5, 2
+; RV32V-NEXT:    lui a7, 4
+; RV32V-NEXT:    lui t1, 8
+; RV32V-NEXT:    lui t3, 16
+; RV32V-NEXT:    lui t5, 32
+; RV32V-NEXT:    lui s0, 64
+; RV32V-NEXT:    lui s2, 128
+; RV32V-NEXT:    lui s4, 256
+; RV32V-NEXT:    vsetvli s7, zero, e32, m2, ta, ma
+; RV32V-NEXT:    vmul.vx v10, v8, s6
+; RV32V-NEXT:    lui s6, 512
+; RV32V-NEXT:    vmul.vx v12, v8, s5
+; RV32V-NEXT:    lui s5, 1024
+; RV32V-NEXT:    vxor.vv v10, v12, v10
+; RV32V-NEXT:    vmul.vx v12, v8, s3
+; RV32V-NEXT:    lui s3, 2048
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, s1
+; RV32V-NEXT:    lui s1, 4096
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t6
+; RV32V-NEXT:    lui t6, 8192
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t4
+; RV32V-NEXT:    lui t4, 16384
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t2
+; RV32V-NEXT:    lui t2, 32768
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t0
+; RV32V-NEXT:    lui t0, 65536
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, a6
+; RV32V-NEXT:    lui a6, 131072
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, a4
+; RV32V-NEXT:    lui a4, 262144
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, a2
+; RV32V-NEXT:    lui a2, 524288
+; RV32V-NEXT:    slli a1, a1, 11
+; RV32V-NEXT:    and a3, a0, a3
+; RV32V-NEXT:    and a5, a0, a5
+; RV32V-NEXT:    and a7, a0, a7
+; RV32V-NEXT:    and t1, a0, t1
+; RV32V-NEXT:    and t3, a0, t3
+; RV32V-NEXT:    and t5, a0, t5
+; RV32V-NEXT:    and s0, a0, s0
+; RV32V-NEXT:    and s2, a0, s2
+; RV32V-NEXT:    and s4, a0, s4
+; RV32V-NEXT:    and s6, a0, s6
+; RV32V-NEXT:    and s5, a0, s5
+; RV32V-NEXT:    and s3, a0, s3
+; RV32V-NEXT:    and s1, a0, s1
+; RV32V-NEXT:    and t6, a0, t6
+; RV32V-NEXT:    and t4, a0, t4
+; RV32V-NEXT:    and t2, a0, t2
+; RV32V-NEXT:    and t0, a0, t0
+; RV32V-NEXT:    and a6, a0, a6
+; RV32V-NEXT:    and a4, a0, a4
+; RV32V-NEXT:    and a2, a0, a2
+; RV32V-NEXT:    and a0, a0, a1
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, a0
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, a3
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, a5
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, a7
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t1
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t3
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t5
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, s0
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, s2
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, s4
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, s6
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, s5
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, s3
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, s1
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t6
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t4
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t2
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, t0
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, a6
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v12, v8, a4
+; RV32V-NEXT:    vxor.vv v10, v10, v12
+; RV32V-NEXT:    vmul.vx v8, v8, a2
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    addi sp, sp, 32
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv4i32_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV64V-NEXT:    vmv.v.x v10, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v12, v10, 2
+; RV64V-NEXT:    vand.vi v14, v10, 1
+; RV64V-NEXT:    vmul.vv v12, v8, v12
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v14, v12
+; RV64V-NEXT:    vand.vi v14, v10, 4
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vi v14, v10, 8
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 32
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 64
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 128
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 256
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 512
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 1024
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 2048
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 8192
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 16384
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 32768
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 65536
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 131072
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 262144
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v10, a0
+; RV64V-NEXT:    lui a0, 524288
+; RV64V-NEXT:    vand.vx v10, v10, a0
+; RV64V-NEXT:    vmul.vv v14, v8, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vv v8, v8, v10
+; RV64V-NEXT:    vxor.vv v8, v12, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv4i32_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf2 v12, v8
+; RV32ZVBC-NEXT:    vzext.vf2 v8, v16
+; RV32ZVBC-NEXT:    vclmul.vv v12, v12, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv4i32_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf2 v12, v8
+; RV64ZVBC-NEXT:    vzext.vf2 v8, v16
+; RV64ZVBC-NEXT:    vclmul.vv v12, v12, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i32> @llvm.clmul.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb)
+  ret <vscale x 4 x i32> %v
+}
+
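For readers skimming the regenerated checks: without Zvbc, the legalizer expands clmul into one and/mul/xor step per bit of the second operand, which is why the RV32V/RV64V bodies above repeat a vand/vmul/vxor triple 32 times with masks 1, 2, 4, ... 0x80000000. A minimal scalar sketch of the same identity (an illustrative helper, not code from this patch; multiplying by the isolated bit (b & (1u << i)) is a left shift whenever the bit is set):

    #include <cstdint>

    // Carryless multiply, low 32 bits. Each loop iteration mirrors one
    // vand/vmul/vxor triple in the expanded vector code above.
    static uint32_t clmul32_expanded(uint32_t a, uint32_t b) {
      uint32_t acc = 0;
      for (unsigned i = 0; i < 32; ++i)
        acc ^= a * (b & (1u << i)); // a << i when bit i of b is set
      return acc;
    }
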
+define <vscale x 8 x i32> @clmul_nxv8i32_vv(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb) nounwind {
+; RV32V-LABEL: clmul_nxv8i32_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; RV32V-NEXT:    vand.vi v16, v12, 2
+; RV32V-NEXT:    vand.vi v20, v12, 1
+; RV32V-NEXT:    vmul.vv v16, v8, v16
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v20, v16
+; RV32V-NEXT:    vand.vi v20, v12, 4
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vi v20, v12, 8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 16
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 32
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 64
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 128
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 256
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 512
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 1024
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 2048
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 4096
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 8192
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 16384
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 32768
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 65536
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 131072
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 262144
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v12, a0
+; RV32V-NEXT:    lui a0, 524288
+; RV32V-NEXT:    vand.vx v12, v12, a0
+; RV32V-NEXT:    vmul.vv v20, v8, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vmul.vv v8, v8, v12
+; RV32V-NEXT:    vxor.vv v8, v16, v8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmul_nxv8i32_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; RV64V-NEXT:    vand.vi v16, v12, 2
+; RV64V-NEXT:    vand.vi v20, v12, 1
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v20, v16
+; RV64V-NEXT:    vand.vi v20, v12, 4
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vi v20, v12, 8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 32
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 64
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 128
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 256
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 512
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 1024
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 2048
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 8192
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 16384
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 32768
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 65536
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 131072
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 262144
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 524288
+; RV64V-NEXT:    vand.vx v12, v12, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vv v8, v8, v12
+; RV64V-NEXT:    vxor.vv v8, v16, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv8i32_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf2 v16, v12
+; RV32ZVBC-NEXT:    vzext.vf2 v24, v8
+; RV32ZVBC-NEXT:    vclmul.vv v16, v24, v16
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v16, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv8i32_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf2 v16, v12
+; RV64ZVBC-NEXT:    vzext.vf2 v24, v8
+; RV64ZVBC-NEXT:    vclmul.vv v16, v24, v16
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v16, 0
+; RV64ZVBC-NEXT:    ret
   %v = call <vscale x 8 x i32> @llvm.clmul.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb)
   ret <vscale x 8 x i32> %v
 }
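
The ZVBC checks above show the promotion this patch targets: the i32 operands are zero-extended to i64 (vzext.vf2), a single vclmul.vv runs at SEW=64, and vnsrl.wi with shift 0 truncates back to i32. This is sound because the low 32 bits of a 64-bit carryless product of zero-extended 32-bit inputs equal the 32-bit carryless product. A hedged scalar model of that argument (clmul64 is an assumed stand-in for the vclmul primitive, not an API from the patch):

    #include <cstdint>

    // Assumed stand-in for the Zvbc vclmul instruction.
    static uint64_t clmul64(uint64_t a, uint64_t b) {
      uint64_t acc = 0;
      for (unsigned i = 0; i < 64; ++i)
        acc ^= ((b >> i) & 1) ? a << i : 0;
      return acc;
    }

    // i32 clmul promoted to the i64 primitive: zero-extend, multiply
    // carrylessly, truncate -- the vzext/vclmul/vnsrl sequence above.
    static uint32_t clmul32_promoted(uint32_t a, uint32_t b) {
      return static_cast<uint32_t>(clmul64(a, b));
    }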
 
 define <vscale x 8 x i32> @clmul_nxv8i32_vx(<vscale x 8 x i32> %va, i32 %b) nounwind {
-; RV32-LABEL: clmul_nxv8i32_vx:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
-; RV32-NEXT:    andi s6, a0, 2
-; RV32-NEXT:    andi s5, a0, 1
-; RV32-NEXT:    andi s3, a0, 4
-; RV32-NEXT:    andi s1, a0, 8
-; RV32-NEXT:    andi t6, a0, 16
-; RV32-NEXT:    andi t4, a0, 32
-; RV32-NEXT:    andi t2, a0, 64
-; RV32-NEXT:    andi t0, a0, 128
-; RV32-NEXT:    andi a6, a0, 256
-; RV32-NEXT:    andi a4, a0, 512
-; RV32-NEXT:    andi a2, a0, 1024
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    lui a3, 1
-; RV32-NEXT:    lui a5, 2
-; RV32-NEXT:    lui a7, 4
-; RV32-NEXT:    lui t1, 8
-; RV32-NEXT:    lui t3, 16
-; RV32-NEXT:    lui t5, 32
-; RV32-NEXT:    lui s0, 64
-; RV32-NEXT:    lui s2, 128
-; RV32-NEXT:    lui s4, 256
-; RV32-NEXT:    vsetvli s7, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmul.vx v12, v8, s6
-; RV32-NEXT:    lui s6, 512
-; RV32-NEXT:    vmul.vx v16, v8, s5
-; RV32-NEXT:    lui s5, 1024
-; RV32-NEXT:    vxor.vv v12, v16, v12
-; RV32-NEXT:    vmul.vx v16, v8, s3
-; RV32-NEXT:    lui s3, 2048
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, s1
-; RV32-NEXT:    lui s1, 4096
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, t6
-; RV32-NEXT:    lui t6, 8192
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, t4
-; RV32-NEXT:    lui t4, 16384
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, t2
-; RV32-NEXT:    lui t2, 32768
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, t0
-; RV32-NEXT:    lui t0, 65536
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, a6
-; RV32-NEXT:    lui a6, 131072
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, a4
-; RV32-NEXT:    lui a4, 262144
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, a2
-; RV32-NEXT:    lui a2, 524288
-; RV32-NEXT:    slli a1, a1, 11
-; RV32-NEXT:    and a3, a0, a3
-; RV32-NEXT:    and a5, a0, a5
-; RV32-NEXT:    and a7, a0, a7
-; RV32-NEXT:    and t1, a0, t1
-; RV32-NEXT:    and t3, a0, t3
-; RV32-NEXT:    and t5, a0, t5
-; RV32-NEXT:    and s0, a0, s0
-; RV32-NEXT:    and s2, a0, s2
-; RV32-NEXT:    and s4, a0, s4
-; RV32-NEXT:    and s6, a0, s6
-; RV32-NEXT:    and s5, a0, s5
-; RV32-NEXT:    and s3, a0, s3
-; RV32-NEXT:    and s1, a0, s1
-; RV32-NEXT:    and t6, a0, t6
-; RV32-NEXT:    and t4, a0, t4
-; RV32-NEXT:    and t2, a0, t2
-; RV32-NEXT:    and t0, a0, t0
-; RV32-NEXT:    and a6, a0, a6
-; RV32-NEXT:    and a4, a0, a4
-; RV32-NEXT:    and a2, a0, a2
-; RV32-NEXT:    and a0, a0, a1
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, a0
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, a3
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, a5
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, a7
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, t1
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, t3
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, t5
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, s0
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, s2
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, s4
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, s6
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, s5
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, s3
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, s1
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, t6
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, t4
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, t2
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, t0
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, a6
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v16, v8, a4
-; RV32-NEXT:    vxor.vv v12, v12, v16
-; RV32-NEXT:    vmul.vx v8, v8, a2
-; RV32-NEXT:    vxor.vv v8, v12, v8
-; RV32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 32
-; RV32-NEXT:    ret
+; RV32V-LABEL: clmul_nxv8i32_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    addi sp, sp, -32
+; RV32V-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
+; RV32V-NEXT:    andi s6, a0, 2
+; RV32V-NEXT:    andi s5, a0, 1
+; RV32V-NEXT:    andi s3, a0, 4
+; RV32V-NEXT:    andi s1, a0, 8
+; RV32V-NEXT:    andi t6, a0, 16
+; RV32V-NEXT:    andi t4, a0, 32
+; RV32V-NEXT:    andi t2, a0, 64
+; RV32V-NEXT:    andi t0, a0, 128
+; RV32V-NEXT:    andi a6, a0, 256
+; RV32V-NEXT:    andi a4, a0, 512
+; RV32V-NEXT:    andi a2, a0, 1024
+; RV32V-NEXT:    li a1, 1
+; RV32V-NEXT:    lui a3, 1
+; RV32V-NEXT:    lui a5, 2
+; RV32V-NEXT:    lui a7, 4
+; RV32V-NEXT:    lui t1, 8
+; RV32V-NEXT:    lui t3, 16
+; RV32V-NEXT:    lui t5, 32
+; RV32V-NEXT:    lui s0, 64
+; RV32V-NEXT:    lui s2, 128
+; RV32V-NEXT:    lui s4, 256
+; RV32V-NEXT:    vsetvli s7, zero, e32, m4, ta, ma
+; RV32V-NEXT:    vmul.vx v12, v8, s6
+; RV32V-NEXT:    lui s6, 512
+; RV32V-NEXT:    vmul.vx v16, v8, s5
+; RV32V-NEXT:    lui s5, 1024
+; RV32V-NEXT:    vxor.vv v12, v16, v12
+; RV32V-NEXT:    vmul.vx v16, v8, s3
+; RV32V-NEXT:    lui s3, 2048
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, s1
+; RV32V-NEXT:    lui s1, 4096
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t6
+; RV32V-NEXT:    lui t6, 8192
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t4
+; RV32V-NEXT:    lui t4, 16384
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t2
+; RV32V-NEXT:    lui t2, 32768
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t0
+; RV32V-NEXT:    lui t0, 65536
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, a6
+; RV32V-NEXT:    lui a6, 131072
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, a4
+; RV32V-NEXT:    lui a4, 262144
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, a2
+; RV32V-NEXT:    lui a2, 524288
+; RV32V-NEXT:    slli a1, a1, 11
+; RV32V-NEXT:    and a3, a0, a3
+; RV32V-NEXT:    and a5, a0, a5
+; RV32V-NEXT:    and a7, a0, a7
+; RV32V-NEXT:    and t1, a0, t1
+; RV32V-NEXT:    and t3, a0, t3
+; RV32V-NEXT:    and t5, a0, t5
+; RV32V-NEXT:    and s0, a0, s0
+; RV32V-NEXT:    and s2, a0, s2
+; RV32V-NEXT:    and s4, a0, s4
+; RV32V-NEXT:    and s6, a0, s6
+; RV32V-NEXT:    and s5, a0, s5
+; RV32V-NEXT:    and s3, a0, s3
+; RV32V-NEXT:    and s1, a0, s1
+; RV32V-NEXT:    and t6, a0, t6
+; RV32V-NEXT:    and t4, a0, t4
+; RV32V-NEXT:    and t2, a0, t2
+; RV32V-NEXT:    and t0, a0, t0
+; RV32V-NEXT:    and a6, a0, a6
+; RV32V-NEXT:    and a4, a0, a4
+; RV32V-NEXT:    and a2, a0, a2
+; RV32V-NEXT:    and a0, a0, a1
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, a0
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, a3
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, a5
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, a7
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t1
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t3
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t5
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, s0
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, s2
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, s4
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, s6
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, s5
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, s3
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, s1
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t6
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t4
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t2
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, t0
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, a6
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v16, v8, a4
+; RV32V-NEXT:    vxor.vv v12, v12, v16
+; RV32V-NEXT:    vmul.vx v8, v8, a2
+; RV32V-NEXT:    vxor.vv v8, v12, v8
+; RV32V-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
+; RV32V-NEXT:    addi sp, sp, 32
+; RV32V-NEXT:    ret
 ;
-; RV64-LABEL: clmul_nxv8i32_vx:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV64-NEXT:    vmv.v.x v12, a0
-; RV64-NEXT:    li a0, 16
-; RV64-NEXT:    vand.vi v16, v12, 2
-; RV64-NEXT:    vand.vi v20, v12, 1
-; RV64-NEXT:    vmul.vv v16, v8, v16
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v20, v16
-; RV64-NEXT:    vand.vi v20, v12, 4
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vi v20, v12, 8
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    li a0, 64
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    li a0, 128
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    li a0, 256
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    li a0, 512
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    li a0, 1024
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    slli a0, a0, 11
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 1
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 2
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 4
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 8
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 16
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 32
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 64
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 128
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 256
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 512
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 1024
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 2048
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 4096
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 8192
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 16384
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 32768
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 65536
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 131072
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 262144
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vand.vx v20, v12, a0
-; RV64-NEXT:    lui a0, 524288
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vmul.vv v20, v8, v20
-; RV64-NEXT:    vxor.vv v16, v16, v20
-; RV64-NEXT:    vmul.vv v8, v8, v12
-; RV64-NEXT:    vxor.vv v8, v16, v8
-; RV64-NEXT:    ret
+; RV64V-LABEL: clmul_nxv8i32_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV64V-NEXT:    vmv.v.x v12, a0
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v16, v12, 2
+; RV64V-NEXT:    vand.vi v20, v12, 1
+; RV64V-NEXT:    vmul.vv v16, v8, v16
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v20, v16
+; RV64V-NEXT:    vand.vi v20, v12, 4
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vi v20, v12, 8
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 16
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 32
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 64
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 128
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 256
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 512
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 1024
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 2048
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 4096
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 8192
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 16384
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 32768
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 65536
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 131072
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 262144
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v12, a0
+; RV64V-NEXT:    lui a0, 524288
+; RV64V-NEXT:    vand.vx v12, v12, a0
+; RV64V-NEXT:    vmul.vv v20, v8, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vv v8, v8, v12
+; RV64V-NEXT:    vxor.vv v8, v16, v8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmul_nxv8i32_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf2 v16, v8
+; RV32ZVBC-NEXT:    vzext.vf2 v8, v24
+; RV32ZVBC-NEXT:    vclmul.vv v16, v16, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v16, 0
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmul_nxv8i32_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf2 v16, v8
+; RV64ZVBC-NEXT:    vzext.vf2 v8, v24
+; RV64ZVBC-NEXT:    vclmul.vv v16, v16, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v16, 0
+; RV64ZVBC-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
   %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
   %v = call <vscale x 8 x i32> @llvm.clmul.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb)
diff --git a/llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll
index 2bb0603838fd6..e8d33a5b10bd7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll
@@ -5,41 +5,105 @@
 ; RUN: llc -mtriple=riscv64 -verify-machineinstrs -mattr=+v,+zvbc < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVBC
 
 define <vscale x 1 x i8> @clmulh_nxv1i8_vv(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb) nounwind {
-; CHECK-LABEL: clmulh_nxv1i8_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v8, v9
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v9, v8, 2
-; CHECK-NEXT:    vand.vi v11, v8, 1
-; CHECK-NEXT:    vmul.vv v9, v10, v9
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v11, v9
-; CHECK-NEXT:    vand.vi v11, v8, 4
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vi v11, v8, 8
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vmul.vv v8, v10, v8
-; CHECK-NEXT:    vxor.vv v8, v9, v8
-; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 8
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmulh_nxv1i8_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; RV32V-NEXT:    vzext.vf2 v10, v8
+; RV32V-NEXT:    vzext.vf2 v8, v9
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v9, v8, 2
+; RV32V-NEXT:    vand.vi v11, v8, 1
+; RV32V-NEXT:    vmul.vv v9, v10, v9
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v11, v9
+; RV32V-NEXT:    vand.vi v11, v8, 4
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vi v11, v8, 8
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    vxor.vv v8, v9, v8
+; RV32V-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v8, 8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv1i8_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; RV64V-NEXT:    vzext.vf2 v10, v8
+; RV64V-NEXT:    vzext.vf2 v8, v9
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v9, v8, 2
+; RV64V-NEXT:    vand.vi v11, v8, 1
+; RV64V-NEXT:    vmul.vv v9, v10, v9
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v11, v9
+; RV64V-NEXT:    vand.vi v11, v8, 4
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vi v11, v8, 8
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vmul.vv v8, v10, v8
+; RV64V-NEXT:    vxor.vv v8, v9, v8
+; RV64V-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v8, 8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv1i8_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf8 v10, v9
+; RV32ZVBC-NEXT:    vzext.vf8 v9, v8
+; RV32ZVBC-NEXT:    vclmul.vv v8, v9, v10
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv1i8_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf8 v10, v9
+; RV64ZVBC-NEXT:    vzext.vf8 v9, v8
+; RV64ZVBC-NEXT:    vclmul.vv v8, v9, v10
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC-NEXT:    ret
   %va.ext = zext <vscale x 1 x i8> %va to <vscale x 1 x i16>
   %vb.ext = zext <vscale x 1 x i8> %vb to <vscale x 1 x i16>
   %clmul = call <vscale x 1 x i16> @llvm.clmul.nxv1i16(<vscale x 1 x i16> %va.ext, <vscale x 1 x i16> %vb.ext)
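The clmulh tests compute the high half by widening: the i8 inputs are zero-extended, the carryless product is formed at the wider type, and the result is shifted right by 8 before truncating (the trailing vnsrl.wi ..., 8). With Zvbc the widening goes all the way to i64 via vzext.vf8 so a real vclmul can be used, then three narrowing shifts recover the i8 result. A scalar sketch under the same assumption (clmul16 is a hypothetical widening helper for illustration only):

    #include <cstdint>

    // Hypothetical 16-bit carryless multiply, illustration only.
    static uint16_t clmul16(uint16_t a, uint16_t b) {
      uint16_t acc = 0;
      for (unsigned i = 0; i < 16; ++i)
        acc ^= ((b >> i) & 1) ? static_cast<uint16_t>(a << i) : 0;
      return acc;
    }

    // clmulh on i8: high byte of the widened carryless product,
    // matching the zext -> clmul -> lshr 8 -> trunc pattern above.
    static uint8_t clmulh8(uint8_t a, uint8_t b) {
      return static_cast<uint8_t>(clmul16(a, b) >> 8);
    }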
@@ -49,43 +113,113 @@ define <vscale x 1 x i8> @clmulh_nxv1i8_vv(<vscale x 1 x i8> %va, <vscale x 1 x
 }
 
 define <vscale x 1 x i8> @clmulh_nxv1i8_vx(<vscale x 1 x i8> %va, i8 %b) nounwind {
-; CHECK-LABEL: clmulh_nxv1i8_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vzext.vf2 v8, v9
-; CHECK-NEXT:    vand.vi v9, v8, 2
-; CHECK-NEXT:    vand.vi v11, v8, 1
-; CHECK-NEXT:    vmul.vv v9, v10, v9
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v11, v9
-; CHECK-NEXT:    vand.vi v11, v8, 4
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vi v11, v8, 8
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vmul.vv v8, v10, v8
-; CHECK-NEXT:    vxor.vv v8, v9, v8
-; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 8
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmulh_nxv1i8_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; RV32V-NEXT:    vmv.v.x v9, a0
+; RV32V-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32V-NEXT:    vzext.vf2 v10, v8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vzext.vf2 v8, v9
+; RV32V-NEXT:    vand.vi v9, v8, 2
+; RV32V-NEXT:    vand.vi v11, v8, 1
+; RV32V-NEXT:    vmul.vv v9, v10, v9
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v11, v9
+; RV32V-NEXT:    vand.vi v11, v8, 4
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vi v11, v8, 8
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    vxor.vv v8, v9, v8
+; RV32V-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v8, 8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv1i8_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64V-NEXT:    vzext.vf2 v10, v8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vzext.vf2 v8, v9
+; RV64V-NEXT:    vand.vi v9, v8, 2
+; RV64V-NEXT:    vand.vi v11, v8, 1
+; RV64V-NEXT:    vmul.vv v9, v10, v9
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v11, v9
+; RV64V-NEXT:    vand.vi v11, v8, 4
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vi v11, v8, 8
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vmul.vv v8, v10, v8
+; RV64V-NEXT:    vxor.vv v8, v9, v8
+; RV64V-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v8, 8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv1i8_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf8 v10, v8
+; RV32ZVBC-NEXT:    vzext.vf8 v8, v9
+; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv1i8_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf8 v10, v8
+; RV64ZVBC-NEXT:    vzext.vf8 v8, v9
+; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
   %va.ext = zext <vscale x 1 x i8> %va to <vscale x 1 x i16>
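The _vx variants splat the scalar operand (insertelement + shufflevector in the IR, vmv.v.x in the checks) and then reuse the vector lowering. In the non-Zvbc expansion the per-bit masks can be applied on the scalar side with andi/and before each vmul.vx, which is what drives the long lui chains and the callee-saved register spills in the larger RV32V bodies earlier in this diff. An illustrative sketch of that shape, using a fixed 4-lane array as a stand-in for a scalable vector (names are ours, not the patch's):

    #include <cstdint>

    // vx form: each bit mask of the splatted scalar is computed once
    // on the scalar side; every vmul.vx then scales the whole vector
    // by one masked scalar, accumulated with vxor.vv.
    static void clmul_vx(uint32_t v[4], uint32_t b) {
      uint32_t acc[4] = {0, 0, 0, 0};
      for (unsigned i = 0; i < 32; ++i) {
        uint32_t m = b & (1u << i);   // scalar-side andi/and
        for (int l = 0; l < 4; ++l)   // vmul.vx + vxor.vv
          acc[l] ^= v[l] * m;
      }
      for (int l = 0; l < 4; ++l)
        v[l] = acc[l];
    }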
@@ -97,41 +231,105 @@ define <vscale x 1 x i8> @clmulh_nxv1i8_vx(<vscale x 1 x i8> %va, i8 %b) nounwin
 }
 
 define <vscale x 2 x i8> @clmulh_nxv2i8_vv(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb) nounwind {
-; CHECK-LABEL: clmulh_nxv2i8_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v8, v9
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v9, v8, 2
-; CHECK-NEXT:    vand.vi v11, v8, 1
-; CHECK-NEXT:    vmul.vv v9, v10, v9
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v11, v9
-; CHECK-NEXT:    vand.vi v11, v8, 4
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vi v11, v8, 8
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vmul.vv v8, v10, v8
-; CHECK-NEXT:    vxor.vv v8, v9, v8
-; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 8
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmulh_nxv2i8_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; RV32V-NEXT:    vzext.vf2 v10, v8
+; RV32V-NEXT:    vzext.vf2 v8, v9
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v9, v8, 2
+; RV32V-NEXT:    vand.vi v11, v8, 1
+; RV32V-NEXT:    vmul.vv v9, v10, v9
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v11, v9
+; RV32V-NEXT:    vand.vi v11, v8, 4
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vi v11, v8, 8
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    vxor.vv v8, v9, v8
+; RV32V-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v8, 8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv2i8_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; RV64V-NEXT:    vzext.vf2 v10, v8
+; RV64V-NEXT:    vzext.vf2 v8, v9
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v9, v8, 2
+; RV64V-NEXT:    vand.vi v11, v8, 1
+; RV64V-NEXT:    vmul.vv v9, v10, v9
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v11, v9
+; RV64V-NEXT:    vand.vi v11, v8, 4
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vi v11, v8, 8
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vmul.vv v8, v10, v8
+; RV64V-NEXT:    vxor.vv v8, v9, v8
+; RV64V-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v8, 8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv2i8_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf8 v10, v9
+; RV32ZVBC-NEXT:    vzext.vf8 v12, v8
+; RV32ZVBC-NEXT:    vclmul.vv v8, v12, v10
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v10, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv2i8_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf8 v10, v9
+; RV64ZVBC-NEXT:    vzext.vf8 v12, v8
+; RV64ZVBC-NEXT:    vclmul.vv v8, v12, v10
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v10, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC-NEXT:    ret
   %va.ext = zext <vscale x 2 x i8> %va to <vscale x 2 x i16>
   %vb.ext = zext <vscale x 2 x i8> %vb to <vscale x 2 x i16>
   %clmul = call <vscale x 2 x i16> @llvm.clmul.nxv2i16(<vscale x 2 x i16> %va.ext, <vscale x 2 x i16> %vb.ext)
@@ -141,135 +339,339 @@ define <vscale x 2 x i8> @clmulh_nxv2i8_vv(<vscale x 2 x i8> %va, <vscale x 2 x
 }
 
 define <vscale x 2 x i8> @clmulh_nxv2i8_vx(<vscale x 2 x i8> %va, i8 %b) nounwind {
-; CHECK-LABEL: clmulh_nxv2i8_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vzext.vf2 v8, v9
-; CHECK-NEXT:    vand.vi v9, v8, 2
-; CHECK-NEXT:    vand.vi v11, v8, 1
-; CHECK-NEXT:    vmul.vv v9, v10, v9
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v11, v9
-; CHECK-NEXT:    vand.vi v11, v8, 4
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vi v11, v8, 8
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vmul.vv v8, v10, v8
-; CHECK-NEXT:    vxor.vv v8, v9, v8
-; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 8
-; CHECK-NEXT:    ret
-  %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
-  %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
-  %va.ext = zext <vscale x 2 x i8> %va to <vscale x 2 x i16>
-  %vb.ext = zext <vscale x 2 x i8> %vb to <vscale x 2 x i16>
-  %clmul = call <vscale x 2 x i16> @llvm.clmul.nxv2i16(<vscale x 2 x i16> %va.ext, <vscale x 2 x i16> %vb.ext)
-  %res.ext = lshr <vscale x 2 x i16> %clmul, splat(i16 8)
-  %res = trunc <vscale x 2 x i16> %res.ext to <vscale x 2 x i8>
-  ret <vscale x 2 x i8> %res
-}
-
-define <vscale x 4 x i8> @clmulh_nxv4i8_vv(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb) nounwind {
-; CHECK-LABEL: clmulh_nxv4i8_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v8, v9
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v9, v8, 2
-; CHECK-NEXT:    vand.vi v11, v8, 1
-; CHECK-NEXT:    vmul.vv v9, v10, v9
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v11, v9
-; CHECK-NEXT:    vand.vi v11, v8, 4
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vi v11, v8, 8
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vmul.vv v8, v10, v8
-; CHECK-NEXT:    vxor.vv v8, v9, v8
-; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 8
-; CHECK-NEXT:    ret
-  %va.ext = zext <vscale x 4 x i8> %va to <vscale x 4 x i16>
-  %vb.ext = zext <vscale x 4 x i8> %vb to <vscale x 4 x i16>
-  %clmul = call <vscale x 4 x i16> @llvm.clmul.nxv4i16(<vscale x 4 x i16> %va.ext, <vscale x 4 x i16> %vb.ext)
-  %res.ext = lshr <vscale x 4 x i16> %clmul, splat(i16 8)
-  %res = trunc <vscale x 4 x i16> %res.ext to <vscale x 4 x i8>
-  ret <vscale x 4 x i8> %res
-}
-
-define <vscale x 4 x i8> @clmulh_nxv4i8_vx(<vscale x 4 x i8> %va, i8 %b) nounwind {
-; CHECK-LABEL: clmulh_nxv4i8_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vzext.vf2 v8, v9
-; CHECK-NEXT:    vand.vi v9, v8, 2
-; CHECK-NEXT:    vand.vi v11, v8, 1
-; CHECK-NEXT:    vmul.vv v9, v10, v9
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v11, v9
-; CHECK-NEXT:    vand.vi v11, v8, 4
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vi v11, v8, 8
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vmul.vv v8, v10, v8
-; CHECK-NEXT:    vxor.vv v8, v9, v8
-; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 8
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmulh_nxv2i8_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV32V-NEXT:    vmv.v.x v9, a0
+; RV32V-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32V-NEXT:    vzext.vf2 v10, v8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vzext.vf2 v8, v9
+; RV32V-NEXT:    vand.vi v9, v8, 2
+; RV32V-NEXT:    vand.vi v11, v8, 1
+; RV32V-NEXT:    vmul.vv v9, v10, v9
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v11, v9
+; RV32V-NEXT:    vand.vi v11, v8, 4
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vi v11, v8, 8
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    vxor.vv v8, v9, v8
+; RV32V-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v8, 8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv2i8_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64V-NEXT:    vzext.vf2 v10, v8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vzext.vf2 v8, v9
+; RV64V-NEXT:    vand.vi v9, v8, 2
+; RV64V-NEXT:    vand.vi v11, v8, 1
+; RV64V-NEXT:    vmul.vv v9, v10, v9
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v11, v9
+; RV64V-NEXT:    vand.vi v11, v8, 4
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vi v11, v8, 8
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vmul.vv v8, v10, v8
+; RV64V-NEXT:    vxor.vv v8, v9, v8
+; RV64V-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v8, 8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv2i8_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf8 v10, v8
+; RV32ZVBC-NEXT:    vzext.vf8 v8, v12
+; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v10, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv2i8_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf8 v10, v8
+; RV64ZVBC-NEXT:    vzext.vf8 v8, v12
+; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v10, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+  %va.ext = zext <vscale x 2 x i8> %va to <vscale x 2 x i16>
+  %vb.ext = zext <vscale x 2 x i8> %vb to <vscale x 2 x i16>
+  %clmul = call <vscale x 2 x i16> @llvm.clmul.nxv2i16(<vscale x 2 x i16> %va.ext, <vscale x 2 x i16> %vb.ext)
+  %res.ext = lshr <vscale x 2 x i16> %clmul, splat(i16 8)
+  %res = trunc <vscale x 2 x i16> %res.ext to <vscale x 2 x i8>
+  ret <vscale x 2 x i8> %res
+}
+
+define <vscale x 4 x i8> @clmulh_nxv4i8_vv(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb) nounwind {
+; RV32V-LABEL: clmulh_nxv4i8_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV32V-NEXT:    vzext.vf2 v10, v8
+; RV32V-NEXT:    vzext.vf2 v8, v9
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v9, v8, 2
+; RV32V-NEXT:    vand.vi v11, v8, 1
+; RV32V-NEXT:    vmul.vv v9, v10, v9
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v11, v9
+; RV32V-NEXT:    vand.vi v11, v8, 4
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vi v11, v8, 8
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    vxor.vv v8, v9, v8
+; RV32V-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v8, 8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv4i8_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV64V-NEXT:    vzext.vf2 v10, v8
+; RV64V-NEXT:    vzext.vf2 v8, v9
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v9, v8, 2
+; RV64V-NEXT:    vand.vi v11, v8, 1
+; RV64V-NEXT:    vmul.vv v9, v10, v9
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v11, v9
+; RV64V-NEXT:    vand.vi v11, v8, 4
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vi v11, v8, 8
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vmul.vv v8, v10, v8
+; RV64V-NEXT:    vxor.vv v8, v9, v8
+; RV64V-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v8, 8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv4i8_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf8 v12, v9
+; RV32ZVBC-NEXT:    vzext.vf8 v16, v8
+; RV32ZVBC-NEXT:    vclmul.vv v8, v16, v12
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v12, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv4i8_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf8 v12, v9
+; RV64ZVBC-NEXT:    vzext.vf8 v16, v8
+; RV64ZVBC-NEXT:    vclmul.vv v8, v16, v12
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v12, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC-NEXT:    ret
+  %va.ext = zext <vscale x 4 x i8> %va to <vscale x 4 x i16>
+  %vb.ext = zext <vscale x 4 x i8> %vb to <vscale x 4 x i16>
+  %clmul = call <vscale x 4 x i16> @llvm.clmul.nxv4i16(<vscale x 4 x i16> %va.ext, <vscale x 4 x i16> %vb.ext)
+  %res.ext = lshr <vscale x 4 x i16> %clmul, splat(i16 8)
+  %res = trunc <vscale x 4 x i16> %res.ext to <vscale x 4 x i8>
+  ret <vscale x 4 x i8> %res
+}
+
+define <vscale x 4 x i8> @clmulh_nxv4i8_vx(<vscale x 4 x i8> %va, i8 %b) nounwind {
+; RV32V-LABEL: clmulh_nxv4i8_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV32V-NEXT:    vmv.v.x v9, a0
+; RV32V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32V-NEXT:    vzext.vf2 v10, v8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vzext.vf2 v8, v9
+; RV32V-NEXT:    vand.vi v9, v8, 2
+; RV32V-NEXT:    vand.vi v11, v8, 1
+; RV32V-NEXT:    vmul.vv v9, v10, v9
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v11, v9
+; RV32V-NEXT:    vand.vi v11, v8, 4
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vi v11, v8, 8
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    vxor.vv v8, v9, v8
+; RV32V-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v8, 8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv4i8_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV64V-NEXT:    vmv.v.x v9, a0
+; RV64V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64V-NEXT:    vzext.vf2 v10, v8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vzext.vf2 v8, v9
+; RV64V-NEXT:    vand.vi v9, v8, 2
+; RV64V-NEXT:    vand.vi v11, v8, 1
+; RV64V-NEXT:    vmul.vv v9, v10, v9
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v11, v9
+; RV64V-NEXT:    vand.vi v11, v8, 4
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vi v11, v8, 8
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vmul.vv v8, v10, v8
+; RV64V-NEXT:    vxor.vv v8, v9, v8
+; RV64V-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v8, 8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv4i8_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf8 v12, v8
+; RV32ZVBC-NEXT:    vzext.vf8 v8, v16
+; RV32ZVBC-NEXT:    vclmul.vv v8, v12, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v12, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 8
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv4i8_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf8 v12, v8
+; RV64ZVBC-NEXT:    vzext.vf8 v8, v16
+; RV64ZVBC-NEXT:    vclmul.vv v8, v12, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v12, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 8
+; RV64ZVBC-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
   %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
   %va.ext = zext <vscale x 4 x i8> %va to <vscale x 4 x i16>
@@ -281,96 +683,230 @@ define <vscale x 4 x i8> @clmulh_nxv4i8_vx(<vscale x 4 x i8> %va, i8 %b) nounwin
 }
 
 define <vscale x 8 x i8> @clmulh_nxv8i8_vv(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb) nounwind {
-; CHECK-LABEL: clmulh_nxv8i8_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v12, v9
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v8, v12, 2
-; CHECK-NEXT:    vand.vi v14, v12, 1
-; CHECK-NEXT:    vmul.vv v8, v10, v8
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v14, v8
-; CHECK-NEXT:    vand.vi v14, v12, 4
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v8, v14
-; CHECK-NEXT:    vand.vi v14, v12, 8
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v8, v14
-; CHECK-NEXT:    vand.vx v14, v12, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v8, v14
-; CHECK-NEXT:    vand.vx v14, v12, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v8, v14
-; CHECK-NEXT:    vand.vx v14, v12, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v8, v14
-; CHECK-NEXT:    vmul.vv v10, v10, v12
-; CHECK-NEXT:    vxor.vv v10, v8, v10
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v10, 8
-; CHECK-NEXT:    ret
-  %va.ext = zext <vscale x 8 x i8> %va to <vscale x 8 x i16>
-  %vb.ext = zext <vscale x 8 x i8> %vb to <vscale x 8 x i16>
-  %clmul = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %va.ext, <vscale x 8 x i16> %vb.ext)
-  %res.ext = lshr <vscale x 8 x i16> %clmul, splat(i16 8)
-  %res = trunc <vscale x 8 x i16> %res.ext to <vscale x 8 x i8>
-  ret <vscale x 8 x i8> %res
-}
-
-define <vscale x 8 x i8> @clmulh_nxv8i8_vx(<vscale x 8 x i8> %va, i8 %b) nounwind {
-; CHECK-LABEL: clmulh_nxv8i8_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vzext.vf2 v8, v12
-; CHECK-NEXT:    vand.vi v12, v8, 2
-; CHECK-NEXT:    vand.vi v14, v8, 1
-; CHECK-NEXT:    vmul.vv v12, v10, v12
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v14, v12
-; CHECK-NEXT:    vand.vi v14, v8, 4
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vi v14, v8, 8
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v8, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v8, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vmul.vv v8, v10, v8
-; CHECK-NEXT:    vxor.vv v10, v12, v8
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v10, 8
-; CHECK-NEXT:    ret
-  %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
-  %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
-  %va.ext = zext <vscale x 8 x i8> %va to <vscale x 8 x i16>
-  %vb.ext = zext <vscale x 8 x i8> %vb to <vscale x 8 x i16>
-  %clmul = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %va.ext, <vscale x 8 x i16> %vb.ext)
-  %res.ext = lshr <vscale x 8 x i16> %clmul, splat(i16 8)
-  %res = trunc <vscale x 8 x i16> %res.ext to <vscale x 8 x i8>
-  ret <vscale x 8 x i8> %res
-}
+; RV32V-LABEL: clmulh_nxv8i8_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; RV32V-NEXT:    vzext.vf2 v10, v8
+; RV32V-NEXT:    vzext.vf2 v12, v9
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v8, v12, 2
+; RV32V-NEXT:    vand.vi v14, v12, 1
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v14, v8
+; RV32V-NEXT:    vand.vi v14, v12, 4
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vi v14, v12, 8
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v12, v12, a0
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vmul.vv v10, v10, v12
+; RV32V-NEXT:    vxor.vv v10, v8, v10
+; RV32V-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v10, 8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv8i8_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; RV64V-NEXT:    vzext.vf2 v10, v8
+; RV64V-NEXT:    vzext.vf2 v12, v9
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v8, v12, 2
+; RV64V-NEXT:    vand.vi v14, v12, 1
+; RV64V-NEXT:    vmul.vv v8, v10, v8
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v14, v8
+; RV64V-NEXT:    vand.vi v14, v12, 4
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vi v14, v12, 8
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v12, v12, a0
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vmul.vv v10, v10, v12
+; RV64V-NEXT:    vxor.vv v10, v8, v10
+; RV64V-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v10, 8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv8i8_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf8 v16, v9
+; RV32ZVBC-NEXT:    vzext.vf8 v24, v8
+; RV32ZVBC-NEXT:    vclmul.vv v8, v24, v16
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v16, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v10, v16, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 8
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv8i8_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf8 v16, v9
+; RV64ZVBC-NEXT:    vzext.vf8 v24, v8
+; RV64ZVBC-NEXT:    vclmul.vv v8, v24, v16
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v16, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v10, v16, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 8
+; RV64ZVBC-NEXT:    ret
+  %va.ext = zext <vscale x 8 x i8> %va to <vscale x 8 x i16>
+  %vb.ext = zext <vscale x 8 x i8> %vb to <vscale x 8 x i16>
+  %clmul = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %va.ext, <vscale x 8 x i16> %vb.ext)
+  %res.ext = lshr <vscale x 8 x i16> %clmul, splat(i16 8)
+  %res = trunc <vscale x 8 x i16> %res.ext to <vscale x 8 x i8>
+  ret <vscale x 8 x i8> %res
+}
+
+define <vscale x 8 x i8> @clmulh_nxv8i8_vx(<vscale x 8 x i8> %va, i8 %b) nounwind {
+; RV32V-LABEL: clmulh_nxv8i8_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV32V-NEXT:    vmv.v.x v12, a0
+; RV32V-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32V-NEXT:    vzext.vf2 v10, v8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vzext.vf2 v8, v12
+; RV32V-NEXT:    vand.vi v12, v8, 2
+; RV32V-NEXT:    vand.vi v14, v8, 1
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v14, v12
+; RV32V-NEXT:    vand.vi v14, v8, 4
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vi v14, v8, 8
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    vxor.vv v10, v12, v8
+; RV32V-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v10, 8
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv8i8_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV64V-NEXT:    vmv.v.x v12, a0
+; RV64V-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64V-NEXT:    vzext.vf2 v10, v8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vzext.vf2 v8, v12
+; RV64V-NEXT:    vand.vi v12, v8, 2
+; RV64V-NEXT:    vand.vi v14, v8, 1
+; RV64V-NEXT:    vmul.vv v12, v10, v12
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v14, v12
+; RV64V-NEXT:    vand.vi v14, v8, 4
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vi v14, v8, 8
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vv v8, v10, v8
+; RV64V-NEXT:    vxor.vv v10, v12, v8
+; RV64V-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v10, 8
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv8i8_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf8 v16, v8
+; RV32ZVBC-NEXT:    vzext.vf8 v8, v24
+; RV32ZVBC-NEXT:    vclmul.vv v8, v16, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v16, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v10, v16, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 8
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv8i8_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf8 v16, v8
+; RV64ZVBC-NEXT:    vzext.vf8 v8, v24
+; RV64ZVBC-NEXT:    vclmul.vv v8, v16, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v16, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v10, v16, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 8
+; RV64ZVBC-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %va.ext = zext <vscale x 8 x i8> %va to <vscale x 8 x i16>
+  %vb.ext = zext <vscale x 8 x i8> %vb to <vscale x 8 x i16>
+  %clmul = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %va.ext, <vscale x 8 x i16> %vb.ext)
+  %res.ext = lshr <vscale x 8 x i16> %clmul, splat(i16 8)
+  %res = trunc <vscale x 8 x i16> %res.ext to <vscale x 8 x i8>
+  ret <vscale x 8 x i8> %res
+}
 
 define <vscale x 16 x i8> @clmulh_nxv16i8_vv(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb) nounwind {
 ; CHECK-LABEL: clmulh_nxv16i8_vv:
@@ -1060,390 +1596,867 @@ define <vscale x 64 x i8> @clmulh_nxv64i8_vx(<vscale x 64 x i8> %va, i8 %b) noun
 }
 
 define <vscale x 1 x i16> @clmulh_nxv1i16_vv(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb) nounwind {
-; CHECK-LABEL: clmulh_nxv1i16_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v8, v9
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v9, v8, 2
-; CHECK-NEXT:    vand.vi v11, v8, 1
-; CHECK-NEXT:    vmul.vv v9, v10, v9
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v11, v9
-; CHECK-NEXT:    vand.vi v11, v8, 4
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vi v11, v8, 8
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vmul.vv v8, v10, v8
-; CHECK-NEXT:    vxor.vv v8, v9, v8
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 16
-; CHECK-NEXT:    ret
-  %va.ext = zext <vscale x 1 x i16> %va to <vscale x 1 x i32>
-  %vb.ext = zext <vscale x 1 x i16> %vb to <vscale x 1 x i32>
-  %clmul = call <vscale x 1 x i32> @llvm.clmul.nxv1i32(<vscale x 1 x i32> %va.ext, <vscale x 1 x i32> %vb.ext)
-  %res.ext = lshr <vscale x 1 x i32> %clmul, splat(i32 16)
-  %res = trunc <vscale x 1 x i32> %res.ext to <vscale x 1 x i16>
-  ret <vscale x 1 x i16> %res
-}
-
-define <vscale x 1 x i16> @clmulh_nxv1i16_vx(<vscale x 1 x i16> %va, i16 %b) nounwind {
-; CHECK-LABEL: clmulh_nxv1i16_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v9, v8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vzext.vf2 v8, v10
-; CHECK-NEXT:    vand.vi v10, v8, 2
-; CHECK-NEXT:    vand.vi v11, v8, 1
-; CHECK-NEXT:    vmul.vv v10, v9, v10
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v11, v10
-; CHECK-NEXT:    vand.vi v11, v8, 4
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vi v11, v8, 8
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vmul.vv v8, v9, v8
-; CHECK-NEXT:    vxor.vv v8, v10, v8
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 16
-; CHECK-NEXT:    ret
-  %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
-  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
-  %va.ext = zext <vscale x 1 x i16> %va to <vscale x 1 x i32>
-  %vb.ext = zext <vscale x 1 x i16> %vb to <vscale x 1 x i32>
-  %clmul = call <vscale x 1 x i32> @llvm.clmul.nxv1i32(<vscale x 1 x i32> %va.ext, <vscale x 1 x i32> %vb.ext)
-  %res.ext = lshr <vscale x 1 x i32> %clmul, splat(i32 16)
-  %res = trunc <vscale x 1 x i32> %res.ext to <vscale x 1 x i16>
-  ret <vscale x 1 x i16> %res
-}
-
-define <vscale x 2 x i16> @clmulh_nxv2i16_vv(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb) nounwind {
-; CHECK-LABEL: clmulh_nxv2i16_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v8, v9
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v9, v8, 2
-; CHECK-NEXT:    vand.vi v11, v8, 1
-; CHECK-NEXT:    vmul.vv v9, v10, v9
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v11, v9
-; CHECK-NEXT:    vand.vi v11, v8, 4
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vi v11, v8, 8
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vmul.vv v11, v10, v11
-; CHECK-NEXT:    vxor.vv v9, v9, v11
-; CHECK-NEXT:    vmul.vv v8, v10, v8
-; CHECK-NEXT:    vxor.vv v8, v9, v8
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 16
-; CHECK-NEXT:    ret
-  %va.ext = zext <vscale x 2 x i16> %va to <vscale x 2 x i32>
-  %vb.ext = zext <vscale x 2 x i16> %vb to <vscale x 2 x i32>
-  %clmul = call <vscale x 2 x i32> @llvm.clmul.nxv2i32(<vscale x 2 x i32> %va.ext, <vscale x 2 x i32> %vb.ext)
-  %res.ext = lshr <vscale x 2 x i32> %clmul, splat(i32 16)
-  %res = trunc <vscale x 2 x i32> %res.ext to <vscale x 2 x i16>
-  ret <vscale x 2 x i16> %res
-}
-
-define <vscale x 2 x i16> @clmulh_nxv2i16_vx(<vscale x 2 x i16> %va, i16 %b) nounwind {
-; CHECK-LABEL: clmulh_nxv2i16_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vzext.vf2 v9, v8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vzext.vf2 v8, v10
-; CHECK-NEXT:    vand.vi v10, v8, 2
-; CHECK-NEXT:    vand.vi v11, v8, 1
-; CHECK-NEXT:    vmul.vv v10, v9, v10
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v11, v10
-; CHECK-NEXT:    vand.vi v11, v8, 4
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vi v11, v8, 8
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vand.vx v11, v8, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vmul.vv v11, v9, v11
-; CHECK-NEXT:    vxor.vv v10, v10, v11
-; CHECK-NEXT:    vmul.vv v8, v9, v8
-; CHECK-NEXT:    vxor.vv v8, v10, v8
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 16
-; CHECK-NEXT:    ret
-  %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
-  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
-  %va.ext = zext <vscale x 2 x i16> %va to <vscale x 2 x i32>
-  %vb.ext = zext <vscale x 2 x i16> %vb to <vscale x 2 x i32>
-  %clmul = call <vscale x 2 x i32> @llvm.clmul.nxv2i32(<vscale x 2 x i32> %va.ext, <vscale x 2 x i32> %vb.ext)
-  %res.ext = lshr <vscale x 2 x i32> %clmul, splat(i32 16)
-  %res = trunc <vscale x 2 x i32> %res.ext to <vscale x 2 x i16>
-  ret <vscale x 2 x i16> %res
-}
-
-define <vscale x 4 x i16> @clmulh_nxv4i16_vv(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb) nounwind {
-; CHECK-LABEL: clmulh_nxv4i16_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vzext.vf2 v12, v9
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v8, v12, 2
-; CHECK-NEXT:    vand.vi v14, v12, 1
-; CHECK-NEXT:    vmul.vv v8, v10, v8
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v14, v8
-; CHECK-NEXT:    vand.vi v14, v12, 4
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v8, v14
-; CHECK-NEXT:    vand.vi v14, v12, 8
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v8, v14
-; CHECK-NEXT:    vand.vx v14, v12, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v8, v14
-; CHECK-NEXT:    vand.vx v14, v12, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v8, v14
-; CHECK-NEXT:    vand.vx v14, v12, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v8, v14
-; CHECK-NEXT:    vand.vx v14, v12, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v8, v14
-; CHECK-NEXT:    vand.vx v14, v12, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v8, v14
-; CHECK-NEXT:    vand.vx v14, v12, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v8, v14
-; CHECK-NEXT:    vand.vx v14, v12, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v8, v14
-; CHECK-NEXT:    vand.vx v14, v12, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v8, v14
-; CHECK-NEXT:    vand.vx v14, v12, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v8, v14
-; CHECK-NEXT:    vand.vx v14, v12, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v8, v14
-; CHECK-NEXT:    vand.vx v14, v12, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v8, v8, v14
-; CHECK-NEXT:    vmul.vv v10, v10, v12
-; CHECK-NEXT:    vxor.vv v10, v8, v10
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v10, 16
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmulh_nxv1i16_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; RV32V-NEXT:    vzext.vf2 v10, v8
+; RV32V-NEXT:    vzext.vf2 v8, v9
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v9, v8, 2
+; RV32V-NEXT:    vand.vi v11, v8, 1
+; RV32V-NEXT:    vmul.vv v9, v10, v9
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v11, v9
+; RV32V-NEXT:    vand.vi v11, v8, 4
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vi v11, v8, 8
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    vxor.vv v8, v9, v8
+; RV32V-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v8, 16
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv1i16_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; RV64V-NEXT:    vzext.vf2 v10, v8
+; RV64V-NEXT:    vzext.vf2 v8, v9
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v9, v8, 2
+; RV64V-NEXT:    vand.vi v11, v8, 1
+; RV64V-NEXT:    vmul.vv v9, v10, v9
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v11, v9
+; RV64V-NEXT:    vand.vi v11, v8, 4
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vi v11, v8, 8
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vmul.vv v8, v10, v8
+; RV64V-NEXT:    vxor.vv v8, v9, v8
+; RV64V-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v8, 16
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv1i16_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf4 v10, v9
+; RV32ZVBC-NEXT:    vzext.vf4 v9, v8
+; RV32ZVBC-NEXT:    vclmul.vv v8, v9, v10
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 16
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv1i16_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf4 v10, v9
+; RV64ZVBC-NEXT:    vzext.vf4 v9, v8
+; RV64ZVBC-NEXT:    vclmul.vv v8, v9, v10
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 16
+; RV64ZVBC-NEXT:    ret
+  %va.ext = zext <vscale x 1 x i16> %va to <vscale x 1 x i32>
+  %vb.ext = zext <vscale x 1 x i16> %vb to <vscale x 1 x i32>
+  %clmul = call <vscale x 1 x i32> @llvm.clmul.nxv1i32(<vscale x 1 x i32> %va.ext, <vscale x 1 x i32> %vb.ext)
+  %res.ext = lshr <vscale x 1 x i32> %clmul, splat(i32 16)
+  %res = trunc <vscale x 1 x i32> %res.ext to <vscale x 1 x i16>
+  ret <vscale x 1 x i16> %res
+}
+
+define <vscale x 1 x i16> @clmulh_nxv1i16_vx(<vscale x 1 x i16> %va, i16 %b) nounwind {
+; RV32V-LABEL: clmulh_nxv1i16_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV32V-NEXT:    vmv.v.x v10, a0
+; RV32V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32V-NEXT:    vzext.vf2 v9, v8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vzext.vf2 v8, v10
+; RV32V-NEXT:    vand.vi v10, v8, 2
+; RV32V-NEXT:    vand.vi v11, v8, 1
+; RV32V-NEXT:    vmul.vv v10, v9, v10
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v8, 4
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v8, 8
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v8, 16
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv1i16_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV64V-NEXT:    vmv.v.x v10, a0
+; RV64V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64V-NEXT:    vzext.vf2 v9, v8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vzext.vf2 v8, v10
+; RV64V-NEXT:    vand.vi v10, v8, 2
+; RV64V-NEXT:    vand.vi v11, v8, 1
+; RV64V-NEXT:    vmul.vv v10, v9, v10
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v8, 4
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v8, 8
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v9, v8
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v8, 16
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv1i16_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v9, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf4 v10, v8
+; RV32ZVBC-NEXT:    vzext.vf4 v8, v9
+; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v8, 16
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv1i16_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v9, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf4 v10, v8
+; RV64ZVBC-NEXT:    vzext.vf4 v8, v9
+; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v8, 16
+; RV64ZVBC-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %va.ext = zext <vscale x 1 x i16> %va to <vscale x 1 x i32>
+  %vb.ext = zext <vscale x 1 x i16> %vb to <vscale x 1 x i32>
+  %clmul = call <vscale x 1 x i32> @llvm.clmul.nxv1i32(<vscale x 1 x i32> %va.ext, <vscale x 1 x i32> %vb.ext)
+  %res.ext = lshr <vscale x 1 x i32> %clmul, splat(i32 16)
+  %res = trunc <vscale x 1 x i32> %res.ext to <vscale x 1 x i16>
+  ret <vscale x 1 x i16> %res
+}
+
+define <vscale x 2 x i16> @clmulh_nxv2i16_vv(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb) nounwind {
+; RV32V-LABEL: clmulh_nxv2i16_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV32V-NEXT:    vzext.vf2 v10, v8
+; RV32V-NEXT:    vzext.vf2 v8, v9
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v9, v8, 2
+; RV32V-NEXT:    vand.vi v11, v8, 1
+; RV32V-NEXT:    vmul.vv v9, v10, v9
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v11, v9
+; RV32V-NEXT:    vand.vi v11, v8, 4
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vi v11, v8, 8
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v11, v10, v11
+; RV32V-NEXT:    vxor.vv v9, v9, v11
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    vxor.vv v8, v9, v8
+; RV32V-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v8, 16
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv2i16_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV64V-NEXT:    vzext.vf2 v10, v8
+; RV64V-NEXT:    vzext.vf2 v8, v9
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v9, v8, 2
+; RV64V-NEXT:    vand.vi v11, v8, 1
+; RV64V-NEXT:    vmul.vv v9, v10, v9
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v11, v9
+; RV64V-NEXT:    vand.vi v11, v8, 4
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vi v11, v8, 8
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vmul.vv v11, v10, v11
+; RV64V-NEXT:    vxor.vv v9, v9, v11
+; RV64V-NEXT:    vmul.vv v8, v10, v8
+; RV64V-NEXT:    vxor.vv v8, v9, v8
+; RV64V-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v8, 16
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv2i16_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf4 v10, v9
+; RV32ZVBC-NEXT:    vzext.vf4 v12, v8
+; RV32ZVBC-NEXT:    vclmul.vv v8, v12, v10
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v10, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 16
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv2i16_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf4 v10, v9
+; RV64ZVBC-NEXT:    vzext.vf4 v12, v8
+; RV64ZVBC-NEXT:    vclmul.vv v8, v12, v10
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v10, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 16
+; RV64ZVBC-NEXT:    ret
+  %va.ext = zext <vscale x 2 x i16> %va to <vscale x 2 x i32>
+  %vb.ext = zext <vscale x 2 x i16> %vb to <vscale x 2 x i32>
+  %clmul = call <vscale x 2 x i32> @llvm.clmul.nxv2i32(<vscale x 2 x i32> %va.ext, <vscale x 2 x i32> %vb.ext)
+  %res.ext = lshr <vscale x 2 x i32> %clmul, splat(i32 16)
+  %res = trunc <vscale x 2 x i32> %res.ext to <vscale x 2 x i16>
+  ret <vscale x 2 x i16> %res
+}
+
+define <vscale x 2 x i16> @clmulh_nxv2i16_vx(<vscale x 2 x i16> %va, i16 %b) nounwind {
+; RV32V-LABEL: clmulh_nxv2i16_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV32V-NEXT:    vmv.v.x v10, a0
+; RV32V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32V-NEXT:    vzext.vf2 v9, v8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vzext.vf2 v8, v10
+; RV32V-NEXT:    vand.vi v10, v8, 2
+; RV32V-NEXT:    vand.vi v11, v8, 1
+; RV32V-NEXT:    vmul.vv v10, v9, v10
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v11, v10
+; RV32V-NEXT:    vand.vi v11, v8, 4
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vi v11, v8, 8
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vand.vx v11, v8, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v11, v9, v11
+; RV32V-NEXT:    vxor.vv v10, v10, v11
+; RV32V-NEXT:    vmul.vv v8, v9, v8
+; RV32V-NEXT:    vxor.vv v8, v10, v8
+; RV32V-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v8, 16
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv2i16_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV64V-NEXT:    vmv.v.x v10, a0
+; RV64V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64V-NEXT:    vzext.vf2 v9, v8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vzext.vf2 v8, v10
+; RV64V-NEXT:    vand.vi v10, v8, 2
+; RV64V-NEXT:    vand.vi v11, v8, 1
+; RV64V-NEXT:    vmul.vv v10, v9, v10
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v11, v10
+; RV64V-NEXT:    vand.vi v11, v8, 4
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vi v11, v8, 8
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vand.vx v11, v8, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vmul.vv v11, v9, v11
+; RV64V-NEXT:    vxor.vv v10, v10, v11
+; RV64V-NEXT:    vmul.vv v8, v9, v8
+; RV64V-NEXT:    vxor.vv v8, v10, v8
+; RV64V-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v8, 16
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv2i16_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v12, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf4 v10, v8
+; RV32ZVBC-NEXT:    vzext.vf4 v8, v12
+; RV32ZVBC-NEXT:    vclmul.vv v8, v10, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v10, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v10, 16
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv2i16_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v12, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf4 v10, v8
+; RV64ZVBC-NEXT:    vzext.vf4 v8, v12
+; RV64ZVBC-NEXT:    vclmul.vv v8, v10, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v10, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v10, 16
+; RV64ZVBC-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %va.ext = zext <vscale x 2 x i16> %va to <vscale x 2 x i32>
+  %vb.ext = zext <vscale x 2 x i16> %vb to <vscale x 2 x i32>
+  %clmul = call <vscale x 2 x i32> @llvm.clmul.nxv2i32(<vscale x 2 x i32> %va.ext, <vscale x 2 x i32> %vb.ext)
+  %res.ext = lshr <vscale x 2 x i32> %clmul, splat(i32 16)
+  %res = trunc <vscale x 2 x i32> %res.ext to <vscale x 2 x i16>
+  ret <vscale x 2 x i16> %res
+}
+
+define <vscale x 4 x i16> @clmulh_nxv4i16_vv(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb) nounwind {
+; RV32V-LABEL: clmulh_nxv4i16_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV32V-NEXT:    vzext.vf2 v10, v8
+; RV32V-NEXT:    vzext.vf2 v12, v9
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v8, v12, 2
+; RV32V-NEXT:    vand.vi v14, v12, 1
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v14, v8
+; RV32V-NEXT:    vand.vi v14, v12, 4
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vi v14, v12, 8
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vand.vx v14, v12, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v12, v12, a0
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v8, v8, v14
+; RV32V-NEXT:    vmul.vv v10, v10, v12
+; RV32V-NEXT:    vxor.vv v10, v8, v10
+; RV32V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v10, 16
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv4i16_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV64V-NEXT:    vzext.vf2 v10, v8
+; RV64V-NEXT:    vzext.vf2 v12, v9
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v8, v12, 2
+; RV64V-NEXT:    vand.vi v14, v12, 1
+; RV64V-NEXT:    vmul.vv v8, v10, v8
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v14, v8
+; RV64V-NEXT:    vand.vi v14, v12, 4
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vi v14, v12, 8
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vand.vx v14, v12, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v12, v12, a0
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v8, v8, v14
+; RV64V-NEXT:    vmul.vv v10, v10, v12
+; RV64V-NEXT:    vxor.vv v10, v8, v10
+; RV64V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v10, 16
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv4i16_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf4 v12, v9
+; RV32ZVBC-NEXT:    vzext.vf4 v16, v8
+; RV32ZVBC-NEXT:    vclmul.vv v8, v16, v12
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v12, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 16
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv4i16_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf4 v12, v9
+; RV64ZVBC-NEXT:    vzext.vf4 v16, v8
+; RV64ZVBC-NEXT:    vclmul.vv v8, v16, v12
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v12, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 16
+; RV64ZVBC-NEXT:    ret
   %va.ext = zext <vscale x 4 x i16> %va to <vscale x 4 x i32>
   %vb.ext = zext <vscale x 4 x i16> %vb to <vscale x 4 x i32>
   %clmul = call <vscale x 4 x i32> @llvm.clmul.nxv4i32(<vscale x 4 x i32> %va.ext, <vscale x 4 x i32> %vb.ext)
@@ -1453,76 +2466,175 @@ define <vscale x 4 x i16> @clmulh_nxv4i16_vv(<vscale x 4 x i16> %va, <vscale x 4
 }
 
 define <vscale x 4 x i16> @clmulh_nxv4i16_vx(<vscale x 4 x i16> %va, i16 %b) nounwind {
-; CHECK-LABEL: clmulh_nxv4i16_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vzext.vf2 v8, v12
-; CHECK-NEXT:    vand.vi v12, v8, 2
-; CHECK-NEXT:    vand.vi v14, v8, 1
-; CHECK-NEXT:    vmul.vv v12, v10, v12
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v14, v12
-; CHECK-NEXT:    vand.vi v14, v8, 4
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vi v14, v8, 8
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v8, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v8, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v8, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v8, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v8, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v8, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v8, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v8, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vand.vx v14, v8, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vmul.vv v14, v10, v14
-; CHECK-NEXT:    vxor.vv v12, v12, v14
-; CHECK-NEXT:    vmul.vv v8, v10, v8
-; CHECK-NEXT:    vxor.vv v10, v12, v8
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v10, 16
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmulh_nxv4i16_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV32V-NEXT:    vmv.v.x v12, a0
+; RV32V-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32V-NEXT:    vzext.vf2 v10, v8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vzext.vf2 v8, v12
+; RV32V-NEXT:    vand.vi v12, v8, 2
+; RV32V-NEXT:    vand.vi v14, v8, 1
+; RV32V-NEXT:    vmul.vv v12, v10, v12
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v14, v12
+; RV32V-NEXT:    vand.vi v14, v8, 4
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vi v14, v8, 8
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vand.vx v14, v8, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v14, v10, v14
+; RV32V-NEXT:    vxor.vv v12, v12, v14
+; RV32V-NEXT:    vmul.vv v8, v10, v8
+; RV32V-NEXT:    vxor.vv v10, v12, v8
+; RV32V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v10, 16
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv4i16_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV64V-NEXT:    vmv.v.x v12, a0
+; RV64V-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64V-NEXT:    vzext.vf2 v10, v8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vzext.vf2 v8, v12
+; RV64V-NEXT:    vand.vi v12, v8, 2
+; RV64V-NEXT:    vand.vi v14, v8, 1
+; RV64V-NEXT:    vmul.vv v12, v10, v12
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v14, v12
+; RV64V-NEXT:    vand.vi v14, v8, 4
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vi v14, v8, 8
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vand.vx v14, v8, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vmul.vv v14, v10, v14
+; RV64V-NEXT:    vxor.vv v12, v12, v14
+; RV64V-NEXT:    vmul.vv v8, v10, v8
+; RV64V-NEXT:    vxor.vv v10, v12, v8
+; RV64V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v10, 16
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv4i16_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v16, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf4 v12, v8
+; RV32ZVBC-NEXT:    vzext.vf4 v8, v16
+; RV32ZVBC-NEXT:    vclmul.vv v8, v12, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v12, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v12, 16
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv4i16_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v16, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf4 v12, v8
+; RV64ZVBC-NEXT:    vzext.vf4 v8, v16
+; RV64ZVBC-NEXT:    vclmul.vv v8, v12, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v12, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v12, 16
+; RV64ZVBC-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
   %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
   %va.ext = zext <vscale x 4 x i16> %va to <vscale x 4 x i32>
@@ -1534,74 +2646,167 @@ define <vscale x 4 x i16> @clmulh_nxv4i16_vx(<vscale x 4 x i16> %va, i16 %b) nou
 }
 
 define <vscale x 8 x i16> @clmulh_nxv8i16_vv(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb) nounwind {
-; CHECK-LABEL: clmulh_nxv8i16_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8
-; CHECK-NEXT:    vzext.vf2 v16, v10
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vand.vi v8, v16, 2
-; CHECK-NEXT:    vand.vi v20, v16, 1
-; CHECK-NEXT:    vmul.vv v8, v12, v8
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v20, v8
-; CHECK-NEXT:    vand.vi v20, v16, 4
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v8, v20
-; CHECK-NEXT:    vand.vi v20, v16, 8
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v8, v20
-; CHECK-NEXT:    vand.vx v20, v16, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v8, v20
-; CHECK-NEXT:    vand.vx v20, v16, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v8, v20
-; CHECK-NEXT:    vand.vx v20, v16, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v8, v20
-; CHECK-NEXT:    vand.vx v20, v16, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v8, v20
-; CHECK-NEXT:    vand.vx v20, v16, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v8, v20
-; CHECK-NEXT:    vand.vx v20, v16, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v8, v20
-; CHECK-NEXT:    vand.vx v20, v16, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v8, v20
-; CHECK-NEXT:    vand.vx v20, v16, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v8, v20
-; CHECK-NEXT:    vand.vx v20, v16, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v8, v20
-; CHECK-NEXT:    vand.vx v20, v16, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v8, v20
-; CHECK-NEXT:    vand.vx v20, v16, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v8, v8, v20
-; CHECK-NEXT:    vmul.vv v12, v12, v16
-; CHECK-NEXT:    vxor.vv v12, v8, v12
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v12, 16
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmulh_nxv8i16_vv:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; RV32V-NEXT:    vzext.vf2 v12, v8
+; RV32V-NEXT:    vzext.vf2 v16, v10
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vand.vi v8, v16, 2
+; RV32V-NEXT:    vand.vi v20, v16, 1
+; RV32V-NEXT:    vmul.vv v8, v12, v8
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v20, v8
+; RV32V-NEXT:    vand.vi v20, v16, 4
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vi v20, v16, 8
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vand.vx v20, v16, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v16, v16, a0
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v8, v8, v20
+; RV32V-NEXT:    vmul.vv v12, v12, v16
+; RV32V-NEXT:    vxor.vv v12, v8, v12
+; RV32V-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v12, 16
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv8i16_vv:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; RV64V-NEXT:    vzext.vf2 v12, v8
+; RV64V-NEXT:    vzext.vf2 v16, v10
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vand.vi v8, v16, 2
+; RV64V-NEXT:    vand.vi v20, v16, 1
+; RV64V-NEXT:    vmul.vv v8, v12, v8
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v20, v8
+; RV64V-NEXT:    vand.vi v20, v16, 4
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vi v20, v16, 8
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vand.vx v20, v16, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v16, v16, a0
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v8, v8, v20
+; RV64V-NEXT:    vmul.vv v12, v12, v16
+; RV64V-NEXT:    vxor.vv v12, v8, v12
+; RV64V-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v12, 16
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv8i16_vv:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf4 v16, v10
+; RV32ZVBC-NEXT:    vzext.vf4 v24, v8
+; RV32ZVBC-NEXT:    vclmul.vv v8, v24, v16
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v16, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v16, 16
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv8i16_vv:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf4 v16, v10
+; RV64ZVBC-NEXT:    vzext.vf4 v24, v8
+; RV64ZVBC-NEXT:    vclmul.vv v8, v24, v16
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v16, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v16, 16
+; RV64ZVBC-NEXT:    ret
   %va.ext = zext <vscale x 8 x i16> %va to <vscale x 8 x i32>
   %vb.ext = zext <vscale x 8 x i16> %vb to <vscale x 8 x i32>
   %clmul = call <vscale x 8 x i32> @llvm.clmul.nxv8i32(<vscale x 8 x i32> %va.ext, <vscale x 8 x i32> %vb.ext)
@@ -1611,76 +2816,175 @@ define <vscale x 8 x i16> @clmulh_nxv8i16_vv(<vscale x 8 x i16> %va, <vscale x 8
 }
 
 define <vscale x 8 x i16> @clmulh_nxv8i16_vx(<vscale x 8 x i16> %va, i16 %b) nounwind {
-; CHECK-LABEL: clmulh_nxv8i16_vx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vzext.vf2 v8, v16
-; CHECK-NEXT:    vand.vi v16, v8, 2
-; CHECK-NEXT:    vand.vi v20, v8, 1
-; CHECK-NEXT:    vmul.vv v16, v12, v16
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v20, v16
-; CHECK-NEXT:    vand.vi v20, v8, 4
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vi v20, v8, 8
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v8, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v8, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v8, a0
-; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v8, a0
-; CHECK-NEXT:    li a0, 512
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v8, a0
-; CHECK-NEXT:    li a0, 1024
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v8, a0
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v8, a0
-; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v8, a0
-; CHECK-NEXT:    lui a0, 4
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vand.vx v20, v8, a0
-; CHECK-NEXT:    lui a0, 8
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vmul.vv v20, v12, v20
-; CHECK-NEXT:    vxor.vv v16, v16, v20
-; CHECK-NEXT:    vmul.vv v8, v12, v8
-; CHECK-NEXT:    vxor.vv v12, v16, v8
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v12, 16
-; CHECK-NEXT:    ret
+; RV32V-LABEL: clmulh_nxv8i16_vx:
+; RV32V:       # %bb.0:
+; RV32V-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV32V-NEXT:    vmv.v.x v16, a0
+; RV32V-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32V-NEXT:    vzext.vf2 v12, v8
+; RV32V-NEXT:    li a0, 16
+; RV32V-NEXT:    vzext.vf2 v8, v16
+; RV32V-NEXT:    vand.vi v16, v8, 2
+; RV32V-NEXT:    vand.vi v20, v8, 1
+; RV32V-NEXT:    vmul.vv v16, v12, v16
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v20, v16
+; RV32V-NEXT:    vand.vi v20, v8, 4
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vi v20, v8, 8
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a0
+; RV32V-NEXT:    li a0, 32
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a0
+; RV32V-NEXT:    li a0, 64
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a0
+; RV32V-NEXT:    li a0, 128
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a0
+; RV32V-NEXT:    li a0, 256
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a0
+; RV32V-NEXT:    li a0, 512
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a0
+; RV32V-NEXT:    li a0, 1024
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a0
+; RV32V-NEXT:    li a0, 1
+; RV32V-NEXT:    slli a0, a0, 11
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a0
+; RV32V-NEXT:    lui a0, 1
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a0
+; RV32V-NEXT:    lui a0, 2
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a0
+; RV32V-NEXT:    lui a0, 4
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vand.vx v20, v8, a0
+; RV32V-NEXT:    lui a0, 8
+; RV32V-NEXT:    vand.vx v8, v8, a0
+; RV32V-NEXT:    vmul.vv v20, v12, v20
+; RV32V-NEXT:    vxor.vv v16, v16, v20
+; RV32V-NEXT:    vmul.vv v8, v12, v8
+; RV32V-NEXT:    vxor.vv v12, v16, v8
+; RV32V-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32V-NEXT:    vnsrl.wi v8, v12, 16
+; RV32V-NEXT:    ret
+;
+; RV64V-LABEL: clmulh_nxv8i16_vx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV64V-NEXT:    vmv.v.x v16, a0
+; RV64V-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64V-NEXT:    vzext.vf2 v12, v8
+; RV64V-NEXT:    li a0, 16
+; RV64V-NEXT:    vzext.vf2 v8, v16
+; RV64V-NEXT:    vand.vi v16, v8, 2
+; RV64V-NEXT:    vand.vi v20, v8, 1
+; RV64V-NEXT:    vmul.vv v16, v12, v16
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v20, v16
+; RV64V-NEXT:    vand.vi v20, v8, 4
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vi v20, v8, 8
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a0
+; RV64V-NEXT:    li a0, 32
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a0
+; RV64V-NEXT:    li a0, 64
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a0
+; RV64V-NEXT:    li a0, 128
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a0
+; RV64V-NEXT:    li a0, 256
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a0
+; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a0
+; RV64V-NEXT:    li a0, 1024
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a0
+; RV64V-NEXT:    li a0, 1
+; RV64V-NEXT:    slli a0, a0, 11
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a0
+; RV64V-NEXT:    lui a0, 1
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a0
+; RV64V-NEXT:    lui a0, 2
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a0
+; RV64V-NEXT:    lui a0, 4
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vand.vx v20, v8, a0
+; RV64V-NEXT:    lui a0, 8
+; RV64V-NEXT:    vand.vx v8, v8, a0
+; RV64V-NEXT:    vmul.vv v20, v12, v20
+; RV64V-NEXT:    vxor.vv v16, v16, v20
+; RV64V-NEXT:    vmul.vv v8, v12, v8
+; RV64V-NEXT:    vxor.vv v12, v16, v8
+; RV64V-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64V-NEXT:    vnsrl.wi v8, v12, 16
+; RV64V-NEXT:    ret
+;
+; RV32ZVBC-LABEL: clmulh_nxv8i16_vx:
+; RV32ZVBC:       # %bb.0:
+; RV32ZVBC-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV32ZVBC-NEXT:    vmv.v.x v24, a0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32ZVBC-NEXT:    vzext.vf4 v16, v8
+; RV32ZVBC-NEXT:    vzext.vf4 v8, v24
+; RV32ZVBC-NEXT:    vclmul.vv v8, v16, v8
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v16, v8, 0
+; RV32ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32ZVBC-NEXT:    vnsrl.wi v8, v16, 16
+; RV32ZVBC-NEXT:    ret
+;
+; RV64ZVBC-LABEL: clmulh_nxv8i16_vx:
+; RV64ZVBC:       # %bb.0:
+; RV64ZVBC-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV64ZVBC-NEXT:    vmv.v.x v24, a0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64ZVBC-NEXT:    vzext.vf4 v16, v8
+; RV64ZVBC-NEXT:    vzext.vf4 v8, v24
+; RV64ZVBC-NEXT:    vclmul.vv v8, v16, v8
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v16, v8, 0
+; RV64ZVBC-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVBC-NEXT:    vnsrl.wi v8, v16, 16
+; RV64ZVBC-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
   %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
   %va.ext = zext <vscale x 8 x i16> %va to <vscale x 8 x i32>

>From 92cfb463abb3ab2370b80e09e24077e2a098156a Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Mon, 2 Mar 2026 20:38:50 -0800
Subject: [PATCH 3/3] fixup! add TODO for Zvbc32e.

---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 2bdffad2ded26..396bb5ab79221 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1117,6 +1117,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       }
 
       if (Subtarget.hasStdExtZvbc() && Subtarget.hasVInstructionsI64()) {
+        // TODO: Support Zvbc32e.
         if (VT.getVectorElementType() == MVT::i64)
           setOperationAction({ISD::CLMUL, ISD::CLMULH}, VT, Legal);
         else {
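
A note for reviewers (not part of the patch): the check bodies above pin down
the two lowering strategies this series distinguishes, and a minimal scalar
model in C++ may make them easier to audit. The names clmul32 and clmulh16
below are illustrative only; they appear nowhere in the series.

#include <cstdint>

// Shift-and-xor expansion, the technique behind the vand/vmul/vxor ladder in
// the RV32V/RV64V checks: multiplying by the isolated bit (b & (1 << i)) is
// the same as shifting a left by i, so XOR-accumulating these partial
// products yields the carryless product. The emitted ladders stop at bit 15
// because the operand was zero-extended from i16.
static uint32_t clmul32(uint32_t a, uint32_t b) {
  uint32_t acc = 0;
  for (int i = 0; i < 32; ++i)
    acc ^= a * (b & (UINT32_C(1) << i)); // carryless partial product of bit i
  return acc;
}

// Promotion path, the shape of the ZVBC checks: zero-extended i16 inputs give
// a carryless product of at most 31 bits, so one widened clmul plus a right
// shift by 16 recovers clmulh exactly (vzext.vf4 + vclmul.vv + two vnsrl.wi
// in the vector form, since Zvbc only provides a 64-bit vclmul).
static uint16_t clmulh16(uint16_t a, uint16_t b) {
  return (uint16_t)(clmul32(a, b) >> 16);
}

Once the Zvbc32e TODO above is addressed, the same recovery should go through
a 32-bit product instead, cutting the vzext.vf4 widening to vzext.vf2 and
halving the LMUL cost; that is an assumption about the follow-up, not
something this series implements.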


