[llvm] [ISel/RISCV] Scalarize small pow-of-2 VECREDUCE_(AND|OR|XOR) (PR #182631)
Ramkumar Ramachandra via llvm-commits
llvm-commits at lists.llvm.org
Sat Feb 21 01:53:04 PST 2026
https://github.com/artagnon updated https://github.com/llvm/llvm-project/pull/182631
>From 058ecf36707a199d2b875adc94b326c5061510b7 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <artagnon at tenstorrent.com>
Date: Thu, 19 Feb 2026 16:23:04 +0000
Subject: [PATCH 1/2] [ISel/RISCV] Scalarize small pow-of-2
VECREDUCE_(AND|OR|XOR)
This matches what the AArch64 and X86 targets already do for these reductions.
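
For illustration only (not part of the patch): a rough sketch of the form this
lowering targets, written as straight-line IR for readability even though the
transform itself runs on the SelectionDAG. It assumes an xor reduction of
<4 x i8> on RV32, and the function name is made up. The vector is reinterpreted
as an XLen-sized scalar and reduced with a log2(N) shift-and-op tree:

  define i8 @sketch_scalarized_xor_v4i8(<4 x i8> %v) {
    %s  = bitcast <4 x i8> %v to i32   ; whole vector fits into one scalar
    %h1 = lshr i32 %s, 16              ; fold the upper half onto the lower half
    %x1 = xor i32 %s, %h1
    %h2 = lshr i32 %x1, 8              ; fold the remaining adjacent lanes
    %x2 = xor i32 %x1, %h2
    %r  = trunc i32 %x2 to i8          ; the reduction result is in the low lane
    ret i8 %r
  }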
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 31 +
.../rvv/fixed-vectors-int-explodevector.ll | 872 ++++++++++++------
.../rvv/fixed-vectors-reduction-formation.ll | 65 +-
.../RISCV/rvv/fixed-vectors-reduction-int.ll | 438 ++++++---
4 files changed, 956 insertions(+), 450 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 5f09ece63d3e5..4d369c421e3f6 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -12291,6 +12291,37 @@ SDValue RISCVTargetLowering::lowerVECREDUCE(SDValue Op,
MVT VecVT = VecEVT.getSimpleVT();
MVT VecEltVT = VecVT.getVectorElementType();
+
+ // Scalarize vecreduce_(and|or|xor) for fixed-vector pow-of-2 types, if the
+ // entire vector fits into a scalar.
+ if (is_contained({ISD::VECREDUCE_AND, ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR},
+ Op.getOpcode()) &&
+ VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
+ VecVT.bitsLE(Subtarget.getXLenVT())) {
+ auto GetArithOpcode = [](unsigned Opcode) {
+ switch (Opcode) {
+ case ISD::VECREDUCE_AND:
+ return ISD::AND;
+ case ISD::VECREDUCE_OR:
+ return ISD::OR;
+ case ISD::VECREDUCE_XOR:
+ return ISD::XOR;
+ }
+ llvm_unreachable("Unexpected opcode");
+ };
+ ISD::NodeType ArithOpcode = GetArithOpcode(Op.getOpcode());
+ unsigned NumElts = VecVT.getVectorNumElements();
+ EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
+ SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
+ for (unsigned Shift = NumElts / 2; Shift > 0; Shift /= 2) {
+ SDValue ShiftAmt = DAG.getShiftAmountConstant(
+ Shift * VecEltVT.getSizeInBits(), ScalarVT, DL);
+ SDValue Shifted = DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmt);
+ Scalar = DAG.getNode(ArithOpcode, DL, ScalarVT, Scalar, Shifted);
+ }
+ return DAG.getAnyExtOrTrunc(Scalar, DL, Op.getValueType());
+ }
+
unsigned RVVOpcode = getRVVReductionOp(Op.getOpcode());
MVT ContainerVT = VecVT;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
index 719659823ed91..cf917da939241 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
@@ -3,13 +3,23 @@
; RUN: llc -mtriple=riscv64 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
define i8 @explode_2xi8(<2 x i8> %v) {
-; CHECK-LABEL: explode_2xi8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: explode_2xi8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a1, a0, 16
+; RV32-NEXT: srli a1, a1, 24
+; RV32-NEXT: xor a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: explode_2xi8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a1, a0, 48
+; RV64-NEXT: srli a1, a1, 56
+; RV64-NEXT: xor a0, a0, a1
+; RV64-NEXT: ret
%e0 = extractelement <2 x i8> %v, i32 0
%e1 = extractelement <2 x i8> %v, i32 1
%add0 = xor i8 %e0, %e1
@@ -17,20 +27,39 @@ define i8 @explode_2xi8(<2 x i8> %v) {
}
define i8 @explode_4xi8(<4 x i8> %v) {
-; CHECK-LABEL: explode_4xi8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 2
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 3
-; CHECK-NEXT: vmv.x.s a1, v9
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a2, v8
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a0, a2, a0
-; CHECK-NEXT: ret
+; RV32-LABEL: explode_4xi8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 2
+; RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 3
+; RV32-NEXT: vmv.x.s a1, v9
+; RV32-NEXT: vmv.x.s a2, v8
+; RV32-NEXT: slli a3, a0, 16
+; RV32-NEXT: srli a3, a3, 24
+; RV32-NEXT: xor a0, a0, a3
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: explode_4xi8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 2
+; RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 3
+; RV64-NEXT: vmv.x.s a1, v9
+; RV64-NEXT: vmv.x.s a2, v8
+; RV64-NEXT: slli a3, a0, 48
+; RV64-NEXT: srli a3, a3, 56
+; RV64-NEXT: xor a0, a0, a3
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: ret
%e0 = extractelement <4 x i8> %v, i32 0
%e1 = extractelement <4 x i8> %v, i32 1
%e2 = extractelement <4 x i8> %v, i32 2
@@ -43,32 +72,63 @@ define i8 @explode_4xi8(<4 x i8> %v) {
define i8 @explode_8xi8(<8 x i8> %v) {
-; CHECK-LABEL: explode_8xi8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 2
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 3
-; CHECK-NEXT: vmv.x.s a1, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 4
-; CHECK-NEXT: vmv.x.s a2, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 5
-; CHECK-NEXT: vmv.x.s a3, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 6
-; CHECK-NEXT: vmv.x.s a4, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 7
-; CHECK-NEXT: vmv.x.s a5, v9
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a6, v8
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: add a0, a6, a0
-; CHECK-NEXT: add a2, a2, a4
-; CHECK-NEXT: add a0, a0, a2
-; CHECK-NEXT: add a0, a0, a5
-; CHECK-NEXT: ret
+; RV32-LABEL: explode_8xi8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 2
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 3
+; RV32-NEXT: vmv.x.s a1, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 4
+; RV32-NEXT: vmv.x.s a2, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 5
+; RV32-NEXT: vmv.x.s a3, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 6
+; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv.x.s a4, v8
+; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 7
+; RV32-NEXT: vmv.x.s a5, v9
+; RV32-NEXT: vmv.x.s a6, v8
+; RV32-NEXT: slli a7, a4, 16
+; RV32-NEXT: srli a7, a7, 24
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a3, a3, a5
+; RV32-NEXT: xor a1, a4, a7
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add a3, a3, a6
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: ret
+;
+; RV64-LABEL: explode_8xi8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 2
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 3
+; RV64-NEXT: vmv.x.s a1, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 4
+; RV64-NEXT: vmv.x.s a2, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 5
+; RV64-NEXT: vmv.x.s a3, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 6
+; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv.x.s a4, v8
+; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 7
+; RV64-NEXT: vmv.x.s a5, v9
+; RV64-NEXT: vmv.x.s a6, v8
+; RV64-NEXT: slli a7, a4, 48
+; RV64-NEXT: srli a7, a7, 56
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a3, a3, a5
+; RV64-NEXT: xor a1, a4, a7
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a0, a1, a0
+; RV64-NEXT: add a3, a3, a6
+; RV64-NEXT: add a0, a0, a3
+; RV64-NEXT: ret
%e0 = extractelement <8 x i8> %v, i32 0
%e1 = extractelement <8 x i8> %v, i32 1
%e2 = extractelement <8 x i8> %v, i32 2
@@ -88,56 +148,127 @@ define i8 @explode_8xi8(<8 x i8> %v) {
}
define i8 @explode_16xi8(<16 x i8> %v) {
-; CHECK-LABEL: explode_16xi8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 2
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 3
-; CHECK-NEXT: vmv.x.s a1, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 4
-; CHECK-NEXT: vmv.x.s a2, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 5
-; CHECK-NEXT: vmv.x.s a3, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 6
-; CHECK-NEXT: vmv.x.s a4, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 7
-; CHECK-NEXT: vmv.x.s a5, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 8
-; CHECK-NEXT: vmv.x.s a6, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 9
-; CHECK-NEXT: vmv.x.s a7, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 10
-; CHECK-NEXT: vmv.x.s t0, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 11
-; CHECK-NEXT: vmv.x.s t1, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 12
-; CHECK-NEXT: vmv.x.s t2, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 13
-; CHECK-NEXT: vmv.x.s t3, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 14
-; CHECK-NEXT: vmv.x.s t4, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 15
-; CHECK-NEXT: vmv.x.s t5, v9
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s t6, v8
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: add a5, a5, a6
-; CHECK-NEXT: add t1, t1, t2
-; CHECK-NEXT: add a0, t6, a0
-; CHECK-NEXT: add a2, a2, a4
-; CHECK-NEXT: add a5, a5, a7
-; CHECK-NEXT: add t1, t1, t3
-; CHECK-NEXT: add a0, a0, a2
-; CHECK-NEXT: add a5, a5, t0
-; CHECK-NEXT: add t1, t1, t4
-; CHECK-NEXT: add a0, a0, a5
-; CHECK-NEXT: add t1, t1, t5
-; CHECK-NEXT: add a0, a0, t1
-; CHECK-NEXT: ret
+; RV32-LABEL: explode_16xi8:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset s0, -4
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 2
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 3
+; RV32-NEXT: vmv.x.s a1, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 4
+; RV32-NEXT: vmv.x.s a2, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 5
+; RV32-NEXT: vmv.x.s a3, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 6
+; RV32-NEXT: vmv.x.s a4, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 7
+; RV32-NEXT: vmv.x.s a5, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 8
+; RV32-NEXT: vmv.x.s a6, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 9
+; RV32-NEXT: vmv.x.s a7, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 10
+; RV32-NEXT: vmv.x.s t0, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 11
+; RV32-NEXT: vmv.x.s t1, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 12
+; RV32-NEXT: vmv.x.s t2, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 13
+; RV32-NEXT: vmv.x.s t3, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 14
+; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; RV32-NEXT: vmv.x.s t4, v8
+; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 15
+; RV32-NEXT: vmv.x.s t5, v9
+; RV32-NEXT: vmv.x.s t6, v8
+; RV32-NEXT: slli s0, t4, 16
+; RV32-NEXT: srli s0, s0, 24
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: xor a1, t4, s0
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a3, a3, a5
+; RV32-NEXT: add a7, a7, t1
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add a3, a3, a6
+; RV32-NEXT: add a7, a7, t2
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: add a7, a7, t3
+; RV32-NEXT: add a0, a0, a7
+; RV32-NEXT: add t5, t5, t6
+; RV32-NEXT: add a0, a0, t5
+; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: .cfi_restore s0
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: explode_16xi8:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset s0, -8
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 2
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 3
+; RV64-NEXT: vmv.x.s a1, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 4
+; RV64-NEXT: vmv.x.s a2, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 5
+; RV64-NEXT: vmv.x.s a3, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 6
+; RV64-NEXT: vmv.x.s a4, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 7
+; RV64-NEXT: vmv.x.s a5, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 8
+; RV64-NEXT: vmv.x.s a6, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 9
+; RV64-NEXT: vmv.x.s a7, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 10
+; RV64-NEXT: vmv.x.s t0, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 11
+; RV64-NEXT: vmv.x.s t1, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 12
+; RV64-NEXT: vmv.x.s t2, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 13
+; RV64-NEXT: vmv.x.s t3, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 14
+; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; RV64-NEXT: vmv.x.s t4, v8
+; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 15
+; RV64-NEXT: vmv.x.s t5, v9
+; RV64-NEXT: vmv.x.s t6, v8
+; RV64-NEXT: slli s0, t4, 48
+; RV64-NEXT: srli s0, s0, 56
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a3, a3, a4
+; RV64-NEXT: add a7, a7, t0
+; RV64-NEXT: xor a1, t4, s0
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a3, a3, a5
+; RV64-NEXT: add a7, a7, t1
+; RV64-NEXT: add a0, a1, a0
+; RV64-NEXT: add a3, a3, a6
+; RV64-NEXT: add a7, a7, t2
+; RV64-NEXT: add a0, a0, a3
+; RV64-NEXT: add a7, a7, t3
+; RV64-NEXT: add a0, a0, a7
+; RV64-NEXT: add t5, t5, t6
+; RV64-NEXT: add a0, a0, t5
+; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: .cfi_restore s0
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
%e0 = extractelement <16 x i8> %v, i32 0
%e1 = extractelement <16 x i8> %v, i32 1
%e2 = extractelement <16 x i8> %v, i32 2
@@ -173,13 +304,21 @@ define i8 @explode_16xi8(<16 x i8> %v) {
}
define i16 @explode_2xi16(<2 x i16> %v) {
-; CHECK-LABEL: explode_2xi16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: explode_2xi16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: srli a1, a0, 16
+; RV32-NEXT: xor a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: explode_2xi16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: srliw a1, a0, 16
+; RV64-NEXT: xor a0, a0, a1
+; RV64-NEXT: ret
%e0 = extractelement <2 x i16> %v, i32 0
%e1 = extractelement <2 x i16> %v, i32 1
%add0 = xor i16 %e0, %e1
@@ -187,20 +326,37 @@ define i16 @explode_2xi16(<2 x i16> %v) {
}
define i16 @explode_4xi16(<4 x i16> %v) {
-; CHECK-LABEL: explode_4xi16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 2
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 3
-; CHECK-NEXT: vmv.x.s a1, v9
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a2, v8
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a0, a2, a0
-; CHECK-NEXT: ret
+; RV32-LABEL: explode_4xi16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 2
+; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 3
+; RV32-NEXT: vmv.x.s a1, v9
+; RV32-NEXT: vmv.x.s a2, v8
+; RV32-NEXT: srli a3, a0, 16
+; RV32-NEXT: xor a0, a0, a3
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: explode_4xi16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 2
+; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 3
+; RV64-NEXT: vmv.x.s a1, v9
+; RV64-NEXT: vmv.x.s a2, v8
+; RV64-NEXT: srliw a3, a0, 16
+; RV64-NEXT: xor a0, a0, a3
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: ret
%e0 = extractelement <4 x i16> %v, i32 0
%e1 = extractelement <4 x i16> %v, i32 1
%e2 = extractelement <4 x i16> %v, i32 2
@@ -213,32 +369,61 @@ define i16 @explode_4xi16(<4 x i16> %v) {
define i16 @explode_8xi16(<8 x i16> %v) {
-; CHECK-LABEL: explode_8xi16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 2
-; CHECK-NEXT: vmv.x.s a0, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 3
-; CHECK-NEXT: vmv.x.s a1, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 4
-; CHECK-NEXT: vmv.x.s a2, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 5
-; CHECK-NEXT: vmv.x.s a3, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 6
-; CHECK-NEXT: vmv.x.s a4, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 7
-; CHECK-NEXT: vmv.x.s a5, v9
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a6, v8
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: add a0, a6, a0
-; CHECK-NEXT: add a2, a2, a4
-; CHECK-NEXT: add a0, a0, a2
-; CHECK-NEXT: add a0, a0, a5
-; CHECK-NEXT: ret
+; RV32-LABEL: explode_8xi16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 2
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 3
+; RV32-NEXT: vmv.x.s a1, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 4
+; RV32-NEXT: vmv.x.s a2, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 5
+; RV32-NEXT: vmv.x.s a3, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 6
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32-NEXT: vmv.x.s a4, v8
+; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 7
+; RV32-NEXT: vmv.x.s a5, v9
+; RV32-NEXT: vmv.x.s a6, v8
+; RV32-NEXT: srli a7, a4, 16
+; RV32-NEXT: xor a4, a4, a7
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a0, a4, a0
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a0, a0, a6
+; RV32-NEXT: ret
+;
+; RV64-LABEL: explode_8xi16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 2
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 3
+; RV64-NEXT: vmv.x.s a1, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 4
+; RV64-NEXT: vmv.x.s a2, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 5
+; RV64-NEXT: vmv.x.s a3, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 6
+; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV64-NEXT: vmv.x.s a4, v8
+; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 7
+; RV64-NEXT: vmv.x.s a5, v9
+; RV64-NEXT: vmv.x.s a6, v8
+; RV64-NEXT: srliw a7, a4, 16
+; RV64-NEXT: xor a4, a4, a7
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a0, a4, a0
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a0, a0, a6
+; RV64-NEXT: ret
%e0 = extractelement <8 x i16> %v, i32 0
%e1 = extractelement <8 x i16> %v, i32 1
%e2 = extractelement <8 x i16> %v, i32 2
@@ -258,57 +443,127 @@ define i16 @explode_8xi16(<8 x i16> %v) {
}
define i16 @explode_16xi16(<16 x i16> %v) {
-; CHECK-LABEL: explode_16xi16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, ma
-; CHECK-NEXT: vslidedown.vi v10, v8, 8
-; CHECK-NEXT: vmv.x.s a0, v10
-; CHECK-NEXT: vslidedown.vi v10, v8, 9
-; CHECK-NEXT: vmv.x.s a1, v10
-; CHECK-NEXT: vslidedown.vi v10, v8, 10
-; CHECK-NEXT: vmv.x.s a2, v10
-; CHECK-NEXT: vslidedown.vi v10, v8, 11
-; CHECK-NEXT: vmv.x.s a3, v10
-; CHECK-NEXT: vslidedown.vi v10, v8, 12
-; CHECK-NEXT: vmv.x.s a4, v10
-; CHECK-NEXT: vslidedown.vi v10, v8, 13
-; CHECK-NEXT: vmv.x.s a5, v10
-; CHECK-NEXT: vslidedown.vi v10, v8, 14
-; CHECK-NEXT: vmv.x.s a6, v10
-; CHECK-NEXT: vslidedown.vi v10, v8, 15
-; CHECK-NEXT: vmv.x.s a7, v10
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 2
-; CHECK-NEXT: vslidedown.vi v10, v8, 3
-; CHECK-NEXT: vmv.x.s t0, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 4
-; CHECK-NEXT: vmv.x.s t1, v10
-; CHECK-NEXT: vslidedown.vi v10, v8, 5
-; CHECK-NEXT: vmv.x.s t2, v9
-; CHECK-NEXT: vslidedown.vi v9, v8, 6
-; CHECK-NEXT: vmv.x.s t3, v10
-; CHECK-NEXT: vslidedown.vi v10, v8, 7
-; CHECK-NEXT: vmv.x.s t4, v9
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vmv.x.s t5, v10
-; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s t6, v8
-; CHECK-NEXT: add t0, t0, t1
-; CHECK-NEXT: add t2, t2, t3
-; CHECK-NEXT: add a0, t5, a0
-; CHECK-NEXT: add a3, a3, a4
-; CHECK-NEXT: add t0, t6, t0
-; CHECK-NEXT: add t2, t2, t4
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a3, a3, a5
-; CHECK-NEXT: add t0, t0, t2
-; CHECK-NEXT: add a0, a0, a2
-; CHECK-NEXT: add a3, a3, a6
-; CHECK-NEXT: add a0, t0, a0
-; CHECK-NEXT: add a3, a3, a7
-; CHECK-NEXT: add a0, a0, a3
-; CHECK-NEXT: ret
+; RV32-LABEL: explode_16xi16:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset s0, -4
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v10, v8, 2
+; RV32-NEXT: vslidedown.vi v11, v8, 3
+; RV32-NEXT: vmv.x.s a0, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 4
+; RV32-NEXT: vmv.x.s a1, v11
+; RV32-NEXT: vslidedown.vi v11, v8, 5
+; RV32-NEXT: vmv.x.s a2, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 6
+; RV32-NEXT: vmv.x.s a3, v11
+; RV32-NEXT: vslidedown.vi v11, v8, 7
+; RV32-NEXT: vmv.x.s a4, v10
+; RV32-NEXT: vmv.x.s a5, v11
+; RV32-NEXT: vsetivli zero, 1, e16, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v10, v8, 8
+; RV32-NEXT: vmv.x.s a6, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 9
+; RV32-NEXT: vmv.x.s a7, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 10
+; RV32-NEXT: vmv.x.s t0, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 11
+; RV32-NEXT: vmv.x.s t1, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 12
+; RV32-NEXT: vmv.x.s t2, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 13
+; RV32-NEXT: vmv.x.s t3, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 14
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV32-NEXT: vmv.x.s t4, v8
+; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 15
+; RV32-NEXT: vmv.x.s t5, v10
+; RV32-NEXT: vmv.x.s t6, v8
+; RV32-NEXT: srli s0, t4, 16
+; RV32-NEXT: xor t4, t4, s0
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a5, a5, a6
+; RV32-NEXT: add t1, t1, t2
+; RV32-NEXT: add a0, t4, a0
+; RV32-NEXT: add a2, a2, a4
+; RV32-NEXT: add a5, a5, a7
+; RV32-NEXT: add t1, t1, t3
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a5, a5, t0
+; RV32-NEXT: add t1, t1, t5
+; RV32-NEXT: add a0, a0, a5
+; RV32-NEXT: add t1, t1, t6
+; RV32-NEXT: add a0, a0, t1
+; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: .cfi_restore s0
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: explode_16xi16:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset s0, -8
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v8, 2
+; RV64-NEXT: vslidedown.vi v11, v8, 3
+; RV64-NEXT: vmv.x.s a0, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 4
+; RV64-NEXT: vmv.x.s a1, v11
+; RV64-NEXT: vslidedown.vi v11, v8, 5
+; RV64-NEXT: vmv.x.s a2, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 6
+; RV64-NEXT: vmv.x.s a3, v11
+; RV64-NEXT: vslidedown.vi v11, v8, 7
+; RV64-NEXT: vmv.x.s a4, v10
+; RV64-NEXT: vmv.x.s a5, v11
+; RV64-NEXT: vsetivli zero, 1, e16, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v8, 8
+; RV64-NEXT: vmv.x.s a6, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 9
+; RV64-NEXT: vmv.x.s a7, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 10
+; RV64-NEXT: vmv.x.s t0, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 11
+; RV64-NEXT: vmv.x.s t1, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 12
+; RV64-NEXT: vmv.x.s t2, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 13
+; RV64-NEXT: vmv.x.s t3, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 14
+; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV64-NEXT: vmv.x.s t4, v8
+; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 15
+; RV64-NEXT: vmv.x.s t5, v10
+; RV64-NEXT: vmv.x.s t6, v8
+; RV64-NEXT: srliw s0, t4, 16
+; RV64-NEXT: xor t4, t4, s0
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a5, a5, a6
+; RV64-NEXT: add t1, t1, t2
+; RV64-NEXT: add a0, t4, a0
+; RV64-NEXT: add a2, a2, a4
+; RV64-NEXT: add a5, a5, a7
+; RV64-NEXT: add t1, t1, t3
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a5, a5, t0
+; RV64-NEXT: add t1, t1, t5
+; RV64-NEXT: add a0, a0, a5
+; RV64-NEXT: add t1, t1, t6
+; RV64-NEXT: add a0, a0, t1
+; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: .cfi_restore s0
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
%e0 = extractelement <16 x i16> %v, i32 0
%e1 = extractelement <16 x i16> %v, i32 1
%e2 = extractelement <16 x i16> %v, i32 2
@@ -344,13 +599,21 @@ define i16 @explode_16xi16(<16 x i16> %v) {
}
define i32 @explode_2xi32(<2 x i32> %v) {
-; CHECK-LABEL: explode_2xi32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: explode_2xi32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vredxor.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: explode_2xi32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: srli a1, a0, 32
+; RV64-NEXT: xor a0, a0, a1
+; RV64-NEXT: ret
%e0 = extractelement <2 x i32> %v, i32 0
%e1 = extractelement <2 x i32> %v, i32 1
%add0 = xor i32 %e0, %e1
@@ -377,15 +640,16 @@ define i32 @explode_4xi32(<4 x i32> %v) {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV64-NEXT: vslidedown.vi v9, v8, 2
-; RV64-NEXT: vmv.x.s a0, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 3
+; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 3
; RV64-NEXT: vmv.x.s a1, v9
-; RV64-NEXT: vmv.s.x v9, zero
-; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; RV64-NEXT: vredxor.vs v8, v8, v9
; RV64-NEXT: vmv.x.s a2, v8
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: addw a0, a2, a0
+; RV64-NEXT: srli a3, a0, 32
+; RV64-NEXT: xor a0, a0, a3
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: addw a0, a0, a1
; RV64-NEXT: ret
%e0 = extractelement <4 x i32> %v, i32 0
%e1 = extractelement <4 x i32> %v, i32 1
@@ -429,30 +693,31 @@ define i32 @explode_8xi32(<8 x i32> %v) {
;
; RV64-LABEL: explode_8xi32:
; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v8, 2
+; RV64-NEXT: vslidedown.vi v11, v8, 3
+; RV64-NEXT: vmv.x.s a0, v10
+; RV64-NEXT: vmv.x.s a1, v11
; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64-NEXT: vslidedown.vi v10, v8, 4
-; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: vslidedown.vi v10, v8, 5
-; RV64-NEXT: vmv.x.s a1, v10
-; RV64-NEXT: vslidedown.vi v10, v8, 6
; RV64-NEXT: vmv.x.s a2, v10
-; RV64-NEXT: vslidedown.vi v10, v8, 7
+; RV64-NEXT: vslidedown.vi v10, v8, 5
; RV64-NEXT: vmv.x.s a3, v10
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 2
-; RV64-NEXT: vslidedown.vi v10, v8, 3
-; RV64-NEXT: vmv.x.s a4, v9
-; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vslidedown.vi v10, v8, 6
+; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; RV64-NEXT: vmv.x.s a4, v8
+; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 7
; RV64-NEXT: vmv.x.s a5, v10
-; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; RV64-NEXT: vredxor.vs v8, v8, v9
; RV64-NEXT: vmv.x.s a6, v8
-; RV64-NEXT: add a4, a4, a5
+; RV64-NEXT: srli a7, a4, 32
+; RV64-NEXT: xor a4, a4, a7
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a4, a6, a4
-; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a2, a2, a3
; RV64-NEXT: add a0, a4, a0
-; RV64-NEXT: addw a0, a0, a3
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: addw a0, a0, a6
; RV64-NEXT: ret
%e0 = extractelement <8 x i32> %v, i32 0
%e1 = extractelement <8 x i32> %v, i32 1
@@ -543,60 +808,64 @@ define i32 @explode_16xi32(<16 x i32> %v) {
; RV64-NEXT: .cfi_def_cfa_offset 128
; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s1, 104(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset ra, -8
; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: .cfi_offset s1, -24
; RV64-NEXT: addi s0, sp, 128
; RV64-NEXT: .cfi_def_cfa s0, 0
; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v12, v8, 2
+; RV64-NEXT: vslidedown.vi v13, v8, 3
+; RV64-NEXT: vmv.x.s a0, v12
+; RV64-NEXT: vmv.x.s a1, v13
; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64-NEXT: vslidedown.vi v12, v8, 4
-; RV64-NEXT: vmv.x.s a0, v12
+; RV64-NEXT: vmv.x.s a2, v12
; RV64-NEXT: vslidedown.vi v12, v8, 5
-; RV64-NEXT: vmv.x.s a1, v12
+; RV64-NEXT: vmv.x.s a3, v12
; RV64-NEXT: vslidedown.vi v12, v8, 6
-; RV64-NEXT: vmv.x.s a2, v12
+; RV64-NEXT: vmv.x.s a4, v12
; RV64-NEXT: vslidedown.vi v12, v8, 7
-; RV64-NEXT: vmv.x.s a3, v12
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 2
-; RV64-NEXT: vslidedown.vi v13, v8, 3
-; RV64-NEXT: mv a4, sp
-; RV64-NEXT: vmv.x.s a5, v12
-; RV64-NEXT: vmv.s.x v12, zero
-; RV64-NEXT: vmv.x.s a6, v13
+; RV64-NEXT: mv a5, sp
+; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; RV64-NEXT: vmv.x.s a6, v8
; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT: vse32.v v8, (a4)
-; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; RV64-NEXT: vredxor.vs v8, v8, v12
-; RV64-NEXT: lw a4, 32(sp)
-; RV64-NEXT: lw a7, 36(sp)
-; RV64-NEXT: lw t0, 40(sp)
-; RV64-NEXT: lw t1, 44(sp)
-; RV64-NEXT: lw t2, 48(sp)
-; RV64-NEXT: lw t3, 52(sp)
-; RV64-NEXT: lw t4, 56(sp)
-; RV64-NEXT: lw t5, 60(sp)
-; RV64-NEXT: vmv.x.s t6, v8
-; RV64-NEXT: add a5, a5, a6
+; RV64-NEXT: vmv.x.s a7, v12
+; RV64-NEXT: vse32.v v8, (a5)
+; RV64-NEXT: srli a5, a6, 32
+; RV64-NEXT: lw t0, 32(sp)
+; RV64-NEXT: lw t1, 36(sp)
+; RV64-NEXT: lw t2, 40(sp)
+; RV64-NEXT: lw t3, 44(sp)
+; RV64-NEXT: lw t4, 48(sp)
+; RV64-NEXT: lw t5, 52(sp)
+; RV64-NEXT: lw t6, 56(sp)
+; RV64-NEXT: lw s1, 60(sp)
+; RV64-NEXT: xor a5, a6, a5
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a5, t6, a5
-; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a2, a2, a3
; RV64-NEXT: add a0, a5, a0
-; RV64-NEXT: add a3, a3, a4
+; RV64-NEXT: add a2, a2, a4
+; RV64-NEXT: add a0, a0, a2
; RV64-NEXT: add a7, a7, t0
-; RV64-NEXT: add t2, t2, t3
-; RV64-NEXT: add a0, a0, a3
-; RV64-NEXT: add a7, a7, t1
-; RV64-NEXT: add t2, t2, t4
+; RV64-NEXT: add t1, t1, t2
+; RV64-NEXT: add t4, t4, t5
; RV64-NEXT: add a0, a0, a7
-; RV64-NEXT: add t2, t2, t5
-; RV64-NEXT: addw a0, a0, t2
+; RV64-NEXT: add t1, t1, t3
+; RV64-NEXT: add t4, t4, t6
+; RV64-NEXT: add a0, a0, t1
+; RV64-NEXT: add t4, t4, s1
+; RV64-NEXT: addw a0, a0, t4
; RV64-NEXT: addi sp, s0, -128
; RV64-NEXT: .cfi_def_cfa sp, 128
; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s1, 104(sp) # 8-byte Folded Reload
; RV64-NEXT: .cfi_restore ra
; RV64-NEXT: .cfi_restore s0
+; RV64-NEXT: .cfi_restore s1
; RV64-NEXT: addi sp, sp, 128
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
@@ -1172,50 +1441,59 @@ define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) {
;
; RV64-LABEL: explode_16xi32_exact_vlen:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset s0, -8
; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV64-NEXT: vslidedown.vi v12, v8, 2
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vslidedown.vi v12, v8, 3
-; RV64-NEXT: vmv.x.s a1, v9
-; RV64-NEXT: vmv.x.s a2, v12
-; RV64-NEXT: vslidedown.vi v12, v9, 1
-; RV64-NEXT: vmv.x.s a3, v12
-; RV64-NEXT: vslidedown.vi v12, v9, 2
-; RV64-NEXT: vmv.x.s a4, v12
+; RV64-NEXT: vslidedown.vi v13, v8, 3
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: vslidedown.vi v14, v9, 1
+; RV64-NEXT: vslidedown.vi v15, v9, 2
; RV64-NEXT: vslidedown.vi v9, v9, 3
-; RV64-NEXT: vmv.x.s a5, v10
-; RV64-NEXT: vmv.x.s a6, v9
-; RV64-NEXT: vslidedown.vi v9, v10, 1
-; RV64-NEXT: vmv.x.s a7, v9
-; RV64-NEXT: vslidedown.vi v9, v10, 2
+; RV64-NEXT: vmv.x.s a1, v10
+; RV64-NEXT: vslidedown.vi v16, v10, 1
+; RV64-NEXT: vslidedown.vi v17, v10, 2
+; RV64-NEXT: vslidedown.vi v10, v10, 3
+; RV64-NEXT: vmv.x.s a2, v11
+; RV64-NEXT: vslidedown.vi v18, v11, 1
+; RV64-NEXT: vslidedown.vi v19, v11, 2
+; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.x.s a3, v8
+; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v11, 3
+; RV64-NEXT: vmv.x.s a4, v12
+; RV64-NEXT: vmv.x.s a5, v13
+; RV64-NEXT: vmv.x.s a6, v14
+; RV64-NEXT: vmv.x.s a7, v15
; RV64-NEXT: vmv.x.s t0, v9
-; RV64-NEXT: vslidedown.vi v9, v10, 3
-; RV64-NEXT: vmv.x.s t1, v11
-; RV64-NEXT: vmv.x.s t2, v9
-; RV64-NEXT: vslidedown.vi v9, v11, 1
-; RV64-NEXT: vmv.x.s t3, v9
-; RV64-NEXT: vslidedown.vi v9, v11, 2
-; RV64-NEXT: vmv.x.s t4, v9
-; RV64-NEXT: vslidedown.vi v9, v11, 3
-; RV64-NEXT: vmv.x.s t5, v9
-; RV64-NEXT: vmv.s.x v9, zero
-; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; RV64-NEXT: vredxor.vs v8, v8, v9
+; RV64-NEXT: vmv.x.s t1, v16
+; RV64-NEXT: vmv.x.s t2, v17
+; RV64-NEXT: vmv.x.s t3, v10
+; RV64-NEXT: vmv.x.s t4, v18
+; RV64-NEXT: vmv.x.s t5, v19
; RV64-NEXT: vmv.x.s t6, v8
-; RV64-NEXT: add a0, a0, a2
-; RV64-NEXT: add a1, a1, a3
-; RV64-NEXT: add a5, a6, a5
-; RV64-NEXT: add t1, t2, t1
-; RV64-NEXT: add a0, t6, a0
-; RV64-NEXT: add a1, a1, a4
-; RV64-NEXT: add a5, a5, a7
-; RV64-NEXT: add t1, t1, t3
+; RV64-NEXT: srli s0, a3, 32
+; RV64-NEXT: xor a3, a3, s0
+; RV64-NEXT: add a4, a4, a5
+; RV64-NEXT: add a0, a0, a6
+; RV64-NEXT: add a1, t0, a1
+; RV64-NEXT: add a2, t3, a2
+; RV64-NEXT: add a3, a3, a4
+; RV64-NEXT: add a0, a0, a7
+; RV64-NEXT: add a1, a1, t1
+; RV64-NEXT: add a2, a2, t4
+; RV64-NEXT: add a0, a3, a0
+; RV64-NEXT: add a1, a1, t2
+; RV64-NEXT: add a2, a2, t5
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a5, a5, t0
-; RV64-NEXT: add t1, t1, t4
-; RV64-NEXT: add a0, a0, a5
-; RV64-NEXT: add t1, t1, t5
-; RV64-NEXT: addw a0, a0, t1
+; RV64-NEXT: add a2, a2, t6
+; RV64-NEXT: addw a0, a0, a2
+; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: .cfi_restore s0
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
@@ -1250,3 +1528,5 @@ define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) {
%add14 = add i32 %add13, %e15
ret i32 %add14
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
index 4da6e103603ce..345e6e2183426 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
@@ -472,14 +472,21 @@ define i32 @reduce_sum_4xi32_reduce_order(<4 x i32> %v) {
;; makes sure that other opcodes work as expected.
define i32 @reduce_xor_16xi32_prefix2(ptr %p) {
-; CHECK-LABEL: reduce_xor_16xi32_prefix2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: reduce_xor_16xi32_prefix2:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vredxor.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: reduce_xor_16xi32_prefix2:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a0, 0(a0)
+; RV64-NEXT: srli a1, a0, 32
+; RV64-NEXT: xor a0, a0, a1
+; RV64-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
@@ -510,13 +517,20 @@ define i32 @reduce_xor_16xi32_prefix5(ptr %p) {
}
define i32 @reduce_and_16xi32_prefix2(ptr %p) {
-; CHECK-LABEL: reduce_and_16xi32_prefix2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vredand.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: reduce_and_16xi32_prefix2:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vredand.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: reduce_and_16xi32_prefix2:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a0, 0(a0)
+; RV64-NEXT: srli a1, a0, 32
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
@@ -549,13 +563,20 @@ define i32 @reduce_and_16xi32_prefix5(ptr %p) {
}
define i32 @reduce_or_16xi32_prefix2(ptr %p) {
-; CHECK-LABEL: reduce_or_16xi32_prefix2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vredor.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: reduce_or_16xi32_prefix2:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vredor.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: reduce_or_16xi32_prefix2:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a0, 0(a0)
+; RV64-NEXT: srli a1, a0, 32
+; RV64-NEXT: or a0, a0, a1
+; RV64-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
index 9725bb37c679b..f308ee9eaaf3a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
@@ -1592,13 +1592,25 @@ define i8 @vreduce_and_v1i8(<1 x i8> %v) {
}
define i8 @vreduce_and_v2i8(ptr %x) {
-; CHECK-LABEL: vreduce_and_v2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vredand.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_and_v2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV32-NEXT: vle8.v v8, (a0)
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a1, a0, 16
+; RV32-NEXT: srli a1, a1, 24
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_and_v2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV64-NEXT: vle8.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a1, a0, 48
+; RV64-NEXT: srli a1, a1, 56
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: ret
%v = load <2 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> %v)
ret i8 %red
@@ -1620,26 +1632,49 @@ define i8 @vreduce_and_v3i8(ptr %x) {
}
define i8 @vreduce_and_v4i8(ptr %x) {
-; CHECK-LABEL: vreduce_and_v4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vredand.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_and_v4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: lw a0, 0(a0)
+; RV32-NEXT: srli a1, a0, 16
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: srli a1, a0, 8
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_and_v4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vle8.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: srliw a1, a0, 16
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: srli a1, a0, 8
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: ret
%v = load <4 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %v)
ret i8 %red
}
define i8 @vreduce_and_v8i8(ptr %x) {
-; CHECK-LABEL: vreduce_and_v8i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vredand.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_and_v8i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vle8.v v8, (a0)
+; RV32-NEXT: vredand.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_and_v8i8:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a0, 0(a0)
+; RV64-NEXT: srli a1, a0, 32
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: srli a1, a0, 16
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: srli a1, a0, 8
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: ret
%v = load <8 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %v)
ret i8 %red
@@ -1728,26 +1763,43 @@ define i16 @vreduce_and_v1i16(<1 x i16> %v) {
}
define i16 @vreduce_and_v2i16(ptr %x) {
-; CHECK-LABEL: vreduce_and_v2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vredand.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_and_v2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: lw a0, 0(a0)
+; RV32-NEXT: srli a1, a0, 16
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_and_v2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT: vle16.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: srliw a1, a0, 16
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: ret
%v = load <2 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %v)
ret i16 %red
}
define i16 @vreduce_and_v4i16(ptr %x) {
-; CHECK-LABEL: vreduce_and_v4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vredand.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_and_v4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-NEXT: vle16.v v8, (a0)
+; RV32-NEXT: vredand.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_and_v4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a0, 0(a0)
+; RV64-NEXT: srli a1, a0, 32
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: srli a1, a0, 16
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: ret
%v = load <4 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %v)
ret i16 %red
@@ -1835,13 +1887,20 @@ define i32 @vreduce_and_v1i32(<1 x i32> %v) {
}
define i32 @vreduce_and_v2i32(ptr %x) {
-; CHECK-LABEL: vreduce_and_v2i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vredand.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_and_v2i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vredand.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_and_v2i32:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a0, 0(a0)
+; RV64-NEXT: srli a1, a0, 32
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: ret
%v = load <2 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %v)
ret i32 %red
@@ -2121,13 +2180,25 @@ define i8 @vreduce_or_v1i8(<1 x i8> %v) {
}
define i8 @vreduce_or_v2i8(ptr %x) {
-; CHECK-LABEL: vreduce_or_v2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vredor.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_or_v2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV32-NEXT: vle8.v v8, (a0)
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a1, a0, 16
+; RV32-NEXT: srli a1, a1, 24
+; RV32-NEXT: or a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_or_v2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV64-NEXT: vle8.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a1, a0, 48
+; RV64-NEXT: srli a1, a1, 56
+; RV64-NEXT: or a0, a0, a1
+; RV64-NEXT: ret
%v = load <2 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> %v)
ret i8 %red
@@ -2148,26 +2219,49 @@ define i8 @vreduce_or_v3i8(ptr %x) {
}
define i8 @vreduce_or_v4i8(ptr %x) {
-; CHECK-LABEL: vreduce_or_v4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vredor.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_or_v4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: lw a0, 0(a0)
+; RV32-NEXT: srli a1, a0, 16
+; RV32-NEXT: or a0, a0, a1
+; RV32-NEXT: srli a1, a0, 8
+; RV32-NEXT: or a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_or_v4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vle8.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: srliw a1, a0, 16
+; RV64-NEXT: or a0, a0, a1
+; RV64-NEXT: srliw a1, a0, 8
+; RV64-NEXT: or a0, a0, a1
+; RV64-NEXT: ret
%v = load <4 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %v)
ret i8 %red
}
define i8 @vreduce_or_v8i8(ptr %x) {
-; CHECK-LABEL: vreduce_or_v8i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vredor.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_or_v8i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vle8.v v8, (a0)
+; RV32-NEXT: vredor.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_or_v8i8:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a0, 0(a0)
+; RV64-NEXT: srli a1, a0, 32
+; RV64-NEXT: or a0, a0, a1
+; RV64-NEXT: srli a1, a0, 16
+; RV64-NEXT: or a0, a0, a1
+; RV64-NEXT: srli a1, a0, 8
+; RV64-NEXT: or a0, a0, a1
+; RV64-NEXT: ret
%v = load <8 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %v)
ret i8 %red
@@ -2256,26 +2350,43 @@ define i16 @vreduce_or_v1i16(<1 x i16> %v) {
}
define i16 @vreduce_or_v2i16(ptr %x) {
-; CHECK-LABEL: vreduce_or_v2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vredor.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_or_v2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: lw a0, 0(a0)
+; RV32-NEXT: srli a1, a0, 16
+; RV32-NEXT: or a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_or_v2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT: vle16.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: srliw a1, a0, 16
+; RV64-NEXT: or a0, a0, a1
+; RV64-NEXT: ret
%v = load <2 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %v)
ret i16 %red
}
define i16 @vreduce_or_v4i16(ptr %x) {
-; CHECK-LABEL: vreduce_or_v4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vredor.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_or_v4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-NEXT: vle16.v v8, (a0)
+; RV32-NEXT: vredor.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_or_v4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a0, 0(a0)
+; RV64-NEXT: srli a1, a0, 32
+; RV64-NEXT: or a0, a0, a1
+; RV64-NEXT: srli a1, a0, 16
+; RV64-NEXT: or a0, a0, a1
+; RV64-NEXT: ret
%v = load <4 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %v)
ret i16 %red
@@ -2363,13 +2474,20 @@ define i32 @vreduce_or_v1i32(<1 x i32> %v) {
}
define i32 @vreduce_or_v2i32(ptr %x) {
-; CHECK-LABEL: vreduce_or_v2i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vredor.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_or_v2i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vredor.vs v8, v8, v8
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_or_v2i32:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a0, 0(a0)
+; RV64-NEXT: srli a1, a0, 32
+; RV64-NEXT: or a0, a0, a1
+; RV64-NEXT: ret
%v = load <2 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %v)
ret i32 %red
@@ -2649,14 +2767,25 @@ define i8 @vreduce_xor_v1i8(<1 x i8> %v) {
}
define i8 @vreduce_xor_v2i8(ptr %x) {
-; CHECK-LABEL: vreduce_xor_v2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_xor_v2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV32-NEXT: vle8.v v8, (a0)
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: slli a1, a0, 16
+; RV32-NEXT: srli a1, a1, 24
+; RV32-NEXT: xor a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_xor_v2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV64-NEXT: vle8.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: slli a1, a0, 48
+; RV64-NEXT: srli a1, a1, 56
+; RV64-NEXT: xor a0, a0, a1
+; RV64-NEXT: ret
%v = load <2 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> %v)
ret i8 %red
@@ -2677,28 +2806,50 @@ define i8 @vreduce_xor_v3i8(ptr %x) {
}
define i8 @vreduce_xor_v4i8(ptr %x) {
-; CHECK-LABEL: vreduce_xor_v4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_xor_v4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: lw a0, 0(a0)
+; RV32-NEXT: srli a1, a0, 16
+; RV32-NEXT: xor a0, a0, a1
+; RV32-NEXT: srli a1, a0, 8
+; RV32-NEXT: xor a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_xor_v4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vle8.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: srliw a1, a0, 16
+; RV64-NEXT: xor a0, a0, a1
+; RV64-NEXT: srliw a1, a0, 8
+; RV64-NEXT: xor a0, a0, a1
+; RV64-NEXT: ret
%v = load <4 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %v)
ret i8 %red
}
define i8 @vreduce_xor_v8i8(ptr %x) {
-; CHECK-LABEL: vreduce_xor_v8i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_xor_v8i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vle8.v v8, (a0)
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vredxor.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_xor_v8i8:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a0, 0(a0)
+; RV64-NEXT: srli a1, a0, 32
+; RV64-NEXT: xor a0, a0, a1
+; RV64-NEXT: srli a1, a0, 16
+; RV64-NEXT: xor a0, a0, a1
+; RV64-NEXT: srli a1, a0, 8
+; RV64-NEXT: xor a0, a0, a1
+; RV64-NEXT: ret
%v = load <8 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %v)
ret i8 %red
@@ -2792,28 +2943,44 @@ define i16 @vreduce_xor_v1i16(<1 x i16> %v) {
}
define i16 @vreduce_xor_v2i16(ptr %x) {
-; CHECK-LABEL: vreduce_xor_v2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_xor_v2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: lw a0, 0(a0)
+; RV32-NEXT: srli a1, a0, 16
+; RV32-NEXT: xor a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_xor_v2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT: vle16.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: srliw a1, a0, 16
+; RV64-NEXT: xor a0, a0, a1
+; RV64-NEXT: ret
%v = load <2 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %v)
ret i16 %red
}
define i16 @vreduce_xor_v4i16(ptr %x) {
-; CHECK-LABEL: vreduce_xor_v4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_xor_v4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-NEXT: vle16.v v8, (a0)
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vredxor.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_xor_v4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a0, 0(a0)
+; RV64-NEXT: srli a1, a0, 32
+; RV64-NEXT: xor a0, a0, a1
+; RV64-NEXT: srli a1, a0, 16
+; RV64-NEXT: xor a0, a0, a1
+; RV64-NEXT: ret
%v = load <4 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %v)
ret i16 %red
@@ -2906,14 +3073,21 @@ define i32 @vreduce_xor_v1i32(<1 x i32> %v) {
}
define i32 @vreduce_xor_v2i32(ptr %x) {
-; CHECK-LABEL: vreduce_xor_v2i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: vreduce_xor_v2i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vredxor.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_xor_v2i32:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a0, 0(a0)
+; RV64-NEXT: srli a1, a0, 32
+; RV64-NEXT: xor a0, a0, a1
+; RV64-NEXT: ret
%v = load <2 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %v)
ret i32 %red
>From 522b91ef5fc406ef2dd2323ad7956271f91aec9c Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <artagnon at tenstorrent.com>
Date: Sat, 21 Feb 2026 09:15:59 +0000
Subject: [PATCH 2/2] [ISel/RISCV] Restrict to loaded values so bitcast is free
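
For illustration only (not part of the patch, function name made up): on RV64,
when the reduction operand is a plain load, the bitcast to an XLen scalar just
re-types the load, so the whole reduction collapses to a scalar load plus a
shift/op tree and no vector instructions are needed:

  define i32 @sketch_loaded_xor_v2i32(ptr %p) {
    %v = load <2 x i32>, ptr %p        ; would otherwise need a vector load
    %s = bitcast <2 x i32> %v to i64   ; free: folds into a scalar i64 load
    %h = lshr i64 %s, 32
    %x = xor i64 %s, %h
    %r = trunc i64 %x to i32
    ret i32 %r
  }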
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 7 +-
.../rvv/fixed-vectors-int-explodevector.ll | 872 ++++++------------
2 files changed, 300 insertions(+), 579 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4d369c421e3f6..7336c766e5154 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -12292,9 +12292,10 @@ SDValue RISCVTargetLowering::lowerVECREDUCE(SDValue Op,
MVT VecVT = VecEVT.getSimpleVT();
MVT VecEltVT = VecVT.getVectorElementType();
- // Scalarize vecreduce_(and|or|xor) for fixed-vector pow-of-2 types, if the
- // entire vector fits into a scalar.
- if (is_contained({ISD::VECREDUCE_AND, ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR},
+ // Scalarize vecreduce_(and|or|xor) for loaded values of fixed-vector pow-of-2
+ // types, if the entire vector fits into a scalar.
+ if (isa<LoadSDNode>(Vec) &&
+ is_contained({ISD::VECREDUCE_AND, ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR},
Op.getOpcode()) &&
VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
VecVT.bitsLE(Subtarget.getXLenVT())) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
index cf917da939241..719659823ed91 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
@@ -3,23 +3,13 @@
; RUN: llc -mtriple=riscv64 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
define i8 @explode_2xi8(<2 x i8> %v) {
-; RV32-LABEL: explode_2xi8:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: slli a1, a0, 16
-; RV32-NEXT: srli a1, a1, 24
-; RV32-NEXT: xor a0, a0, a1
-; RV32-NEXT: ret
-;
-; RV64-LABEL: explode_2xi8:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: slli a1, a0, 48
-; RV64-NEXT: srli a1, a1, 56
-; RV64-NEXT: xor a0, a0, a1
-; RV64-NEXT: ret
+; CHECK-LABEL: explode_2xi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vredxor.vs v8, v8, v9
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
%e0 = extractelement <2 x i8> %v, i32 0
%e1 = extractelement <2 x i8> %v, i32 1
%add0 = xor i8 %e0, %e1
@@ -27,39 +17,20 @@ define i8 @explode_2xi8(<2 x i8> %v) {
}
define i8 @explode_4xi8(<4 x i8> %v) {
-; RV32-LABEL: explode_4xi8:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v8, 2
-; RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 3
-; RV32-NEXT: vmv.x.s a1, v9
-; RV32-NEXT: vmv.x.s a2, v8
-; RV32-NEXT: slli a3, a0, 16
-; RV32-NEXT: srli a3, a3, 24
-; RV32-NEXT: xor a0, a0, a3
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: ret
-;
-; RV64-LABEL: explode_4xi8:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 2
-; RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 3
-; RV64-NEXT: vmv.x.s a1, v9
-; RV64-NEXT: vmv.x.s a2, v8
-; RV64-NEXT: slli a3, a0, 48
-; RV64-NEXT: srli a3, a3, 56
-; RV64-NEXT: xor a0, a0, a3
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: ret
+; CHECK-LABEL: explode_4xi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v8, 2
+; CHECK-NEXT: vmv.x.s a0, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 3
+; CHECK-NEXT: vmv.x.s a1, v9
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT: vredxor.vs v8, v8, v9
+; CHECK-NEXT: vmv.x.s a2, v8
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add a0, a2, a0
+; CHECK-NEXT: ret
%e0 = extractelement <4 x i8> %v, i32 0
%e1 = extractelement <4 x i8> %v, i32 1
%e2 = extractelement <4 x i8> %v, i32 2
@@ -72,63 +43,32 @@ define i8 @explode_4xi8(<4 x i8> %v) {
define i8 @explode_8xi8(<8 x i8> %v) {
-; RV32-LABEL: explode_8xi8:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v8, 2
-; RV32-NEXT: vmv.x.s a0, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 3
-; RV32-NEXT: vmv.x.s a1, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 4
-; RV32-NEXT: vmv.x.s a2, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 5
-; RV32-NEXT: vmv.x.s a3, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 6
-; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; RV32-NEXT: vmv.x.s a4, v8
-; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 7
-; RV32-NEXT: vmv.x.s a5, v9
-; RV32-NEXT: vmv.x.s a6, v8
-; RV32-NEXT: slli a7, a4, 16
-; RV32-NEXT: srli a7, a7, 24
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a3, a3, a5
-; RV32-NEXT: xor a1, a4, a7
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: add a3, a3, a6
-; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: ret
-;
-; RV64-LABEL: explode_8xi8:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 2
-; RV64-NEXT: vmv.x.s a0, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 3
-; RV64-NEXT: vmv.x.s a1, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 4
-; RV64-NEXT: vmv.x.s a2, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 5
-; RV64-NEXT: vmv.x.s a3, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 6
-; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; RV64-NEXT: vmv.x.s a4, v8
-; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 7
-; RV64-NEXT: vmv.x.s a5, v9
-; RV64-NEXT: vmv.x.s a6, v8
-; RV64-NEXT: slli a7, a4, 48
-; RV64-NEXT: srli a7, a7, 56
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a3, a3, a5
-; RV64-NEXT: xor a1, a4, a7
-; RV64-NEXT: add a0, a0, a2
-; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: add a3, a3, a6
-; RV64-NEXT: add a0, a0, a3
-; RV64-NEXT: ret
+; CHECK-LABEL: explode_8xi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v8, 2
+; CHECK-NEXT: vmv.x.s a0, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 3
+; CHECK-NEXT: vmv.x.s a1, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 4
+; CHECK-NEXT: vmv.x.s a2, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 5
+; CHECK-NEXT: vmv.x.s a3, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 6
+; CHECK-NEXT: vmv.x.s a4, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 7
+; CHECK-NEXT: vmv.x.s a5, v9
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT: vredxor.vs v8, v8, v9
+; CHECK-NEXT: vmv.x.s a6, v8
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: add a0, a6, a0
+; CHECK-NEXT: add a2, a2, a4
+; CHECK-NEXT: add a0, a0, a2
+; CHECK-NEXT: add a0, a0, a5
+; CHECK-NEXT: ret
%e0 = extractelement <8 x i8> %v, i32 0
%e1 = extractelement <8 x i8> %v, i32 1
%e2 = extractelement <8 x i8> %v, i32 2
@@ -148,127 +88,56 @@ define i8 @explode_8xi8(<8 x i8> %v) {
}
define i8 @explode_16xi8(<16 x i8> %v) {
-; RV32-LABEL: explode_16xi8:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset s0, -4
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v8, 2
-; RV32-NEXT: vmv.x.s a0, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 3
-; RV32-NEXT: vmv.x.s a1, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 4
-; RV32-NEXT: vmv.x.s a2, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 5
-; RV32-NEXT: vmv.x.s a3, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 6
-; RV32-NEXT: vmv.x.s a4, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 7
-; RV32-NEXT: vmv.x.s a5, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 8
-; RV32-NEXT: vmv.x.s a6, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 9
-; RV32-NEXT: vmv.x.s a7, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 10
-; RV32-NEXT: vmv.x.s t0, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 11
-; RV32-NEXT: vmv.x.s t1, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 12
-; RV32-NEXT: vmv.x.s t2, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 13
-; RV32-NEXT: vmv.x.s t3, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 14
-; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT: vmv.x.s t4, v8
-; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 15
-; RV32-NEXT: vmv.x.s t5, v9
-; RV32-NEXT: vmv.x.s t6, v8
-; RV32-NEXT: slli s0, t4, 16
-; RV32-NEXT: srli s0, s0, 24
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a3, a3, a4
-; RV32-NEXT: add a7, a7, t0
-; RV32-NEXT: xor a1, t4, s0
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: add a3, a3, a5
-; RV32-NEXT: add a7, a7, t1
-; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: add a3, a3, a6
-; RV32-NEXT: add a7, a7, t2
-; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: add a7, a7, t3
-; RV32-NEXT: add a0, a0, a7
-; RV32-NEXT: add t5, t5, t6
-; RV32-NEXT: add a0, a0, t5
-; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore s0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
-; RV32-NEXT: ret
-;
-; RV64-LABEL: explode_16xi8:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset s0, -8
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 2
-; RV64-NEXT: vmv.x.s a0, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 3
-; RV64-NEXT: vmv.x.s a1, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 4
-; RV64-NEXT: vmv.x.s a2, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 5
-; RV64-NEXT: vmv.x.s a3, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 6
-; RV64-NEXT: vmv.x.s a4, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 7
-; RV64-NEXT: vmv.x.s a5, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 8
-; RV64-NEXT: vmv.x.s a6, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 9
-; RV64-NEXT: vmv.x.s a7, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 10
-; RV64-NEXT: vmv.x.s t0, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 11
-; RV64-NEXT: vmv.x.s t1, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 12
-; RV64-NEXT: vmv.x.s t2, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 13
-; RV64-NEXT: vmv.x.s t3, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 14
-; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT: vmv.x.s t4, v8
-; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 15
-; RV64-NEXT: vmv.x.s t5, v9
-; RV64-NEXT: vmv.x.s t6, v8
-; RV64-NEXT: slli s0, t4, 48
-; RV64-NEXT: srli s0, s0, 56
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: add a7, a7, t0
-; RV64-NEXT: xor a1, t4, s0
-; RV64-NEXT: add a0, a0, a2
-; RV64-NEXT: add a3, a3, a5
-; RV64-NEXT: add a7, a7, t1
-; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: add a3, a3, a6
-; RV64-NEXT: add a7, a7, t2
-; RV64-NEXT: add a0, a0, a3
-; RV64-NEXT: add a7, a7, t3
-; RV64-NEXT: add a0, a0, a7
-; RV64-NEXT: add t5, t5, t6
-; RV64-NEXT: add a0, a0, t5
-; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: .cfi_restore s0
-; RV64-NEXT: addi sp, sp, 16
-; RV64-NEXT: .cfi_def_cfa_offset 0
-; RV64-NEXT: ret
+; CHECK-LABEL: explode_16xi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v8, 2
+; CHECK-NEXT: vmv.x.s a0, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 3
+; CHECK-NEXT: vmv.x.s a1, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 4
+; CHECK-NEXT: vmv.x.s a2, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 5
+; CHECK-NEXT: vmv.x.s a3, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 6
+; CHECK-NEXT: vmv.x.s a4, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 7
+; CHECK-NEXT: vmv.x.s a5, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 8
+; CHECK-NEXT: vmv.x.s a6, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 9
+; CHECK-NEXT: vmv.x.s a7, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 10
+; CHECK-NEXT: vmv.x.s t0, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 11
+; CHECK-NEXT: vmv.x.s t1, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 12
+; CHECK-NEXT: vmv.x.s t2, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 13
+; CHECK-NEXT: vmv.x.s t3, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 14
+; CHECK-NEXT: vmv.x.s t4, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 15
+; CHECK-NEXT: vmv.x.s t5, v9
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT: vredxor.vs v8, v8, v9
+; CHECK-NEXT: vmv.x.s t6, v8
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: add a5, a5, a6
+; CHECK-NEXT: add t1, t1, t2
+; CHECK-NEXT: add a0, t6, a0
+; CHECK-NEXT: add a2, a2, a4
+; CHECK-NEXT: add a5, a5, a7
+; CHECK-NEXT: add t1, t1, t3
+; CHECK-NEXT: add a0, a0, a2
+; CHECK-NEXT: add a5, a5, t0
+; CHECK-NEXT: add t1, t1, t4
+; CHECK-NEXT: add a0, a0, a5
+; CHECK-NEXT: add t1, t1, t5
+; CHECK-NEXT: add a0, a0, t1
+; CHECK-NEXT: ret
%e0 = extractelement <16 x i8> %v, i32 0
%e1 = extractelement <16 x i8> %v, i32 1
%e2 = extractelement <16 x i8> %v, i32 2
@@ -304,21 +173,13 @@ define i8 @explode_16xi8(<16 x i8> %v) {
}
define i16 @explode_2xi16(<2 x i16> %v) {
-; RV32-LABEL: explode_2xi16:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: srli a1, a0, 16
-; RV32-NEXT: xor a0, a0, a1
-; RV32-NEXT: ret
-;
-; RV64-LABEL: explode_2xi16:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: srliw a1, a0, 16
-; RV64-NEXT: xor a0, a0, a1
-; RV64-NEXT: ret
+; CHECK-LABEL: explode_2xi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vredxor.vs v8, v8, v9
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
%e0 = extractelement <2 x i16> %v, i32 0
%e1 = extractelement <2 x i16> %v, i32 1
%add0 = xor i16 %e0, %e1
@@ -326,37 +187,20 @@ define i16 @explode_2xi16(<2 x i16> %v) {
}
define i16 @explode_4xi16(<4 x i16> %v) {
-; RV32-LABEL: explode_4xi16:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v8, 2
-; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 3
-; RV32-NEXT: vmv.x.s a1, v9
-; RV32-NEXT: vmv.x.s a2, v8
-; RV32-NEXT: srli a3, a0, 16
-; RV32-NEXT: xor a0, a0, a3
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: ret
-;
-; RV64-LABEL: explode_4xi16:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 2
-; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 3
-; RV64-NEXT: vmv.x.s a1, v9
-; RV64-NEXT: vmv.x.s a2, v8
-; RV64-NEXT: srliw a3, a0, 16
-; RV64-NEXT: xor a0, a0, a3
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: ret
+; CHECK-LABEL: explode_4xi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v8, 2
+; CHECK-NEXT: vmv.x.s a0, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 3
+; CHECK-NEXT: vmv.x.s a1, v9
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vredxor.vs v8, v8, v9
+; CHECK-NEXT: vmv.x.s a2, v8
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add a0, a2, a0
+; CHECK-NEXT: ret
%e0 = extractelement <4 x i16> %v, i32 0
%e1 = extractelement <4 x i16> %v, i32 1
%e2 = extractelement <4 x i16> %v, i32 2
@@ -369,61 +213,32 @@ define i16 @explode_4xi16(<4 x i16> %v) {
define i16 @explode_8xi16(<8 x i16> %v) {
-; RV32-LABEL: explode_8xi16:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v8, 2
-; RV32-NEXT: vmv.x.s a0, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 3
-; RV32-NEXT: vmv.x.s a1, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 4
-; RV32-NEXT: vmv.x.s a2, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 5
-; RV32-NEXT: vmv.x.s a3, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 6
-; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32-NEXT: vmv.x.s a4, v8
-; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 7
-; RV32-NEXT: vmv.x.s a5, v9
-; RV32-NEXT: vmv.x.s a6, v8
-; RV32-NEXT: srli a7, a4, 16
-; RV32-NEXT: xor a4, a4, a7
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a0, a4, a0
-; RV32-NEXT: add a2, a2, a5
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: add a0, a0, a6
-; RV32-NEXT: ret
-;
-; RV64-LABEL: explode_8xi16:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 2
-; RV64-NEXT: vmv.x.s a0, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 3
-; RV64-NEXT: vmv.x.s a1, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 4
-; RV64-NEXT: vmv.x.s a2, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 5
-; RV64-NEXT: vmv.x.s a3, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 6
-; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV64-NEXT: vmv.x.s a4, v8
-; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 7
-; RV64-NEXT: vmv.x.s a5, v9
-; RV64-NEXT: vmv.x.s a6, v8
-; RV64-NEXT: srliw a7, a4, 16
-; RV64-NEXT: xor a4, a4, a7
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a0, a4, a0
-; RV64-NEXT: add a2, a2, a5
-; RV64-NEXT: add a0, a0, a2
-; RV64-NEXT: add a0, a0, a6
-; RV64-NEXT: ret
+; CHECK-LABEL: explode_8xi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v8, 2
+; CHECK-NEXT: vmv.x.s a0, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 3
+; CHECK-NEXT: vmv.x.s a1, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 4
+; CHECK-NEXT: vmv.x.s a2, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 5
+; CHECK-NEXT: vmv.x.s a3, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 6
+; CHECK-NEXT: vmv.x.s a4, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 7
+; CHECK-NEXT: vmv.x.s a5, v9
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vredxor.vs v8, v8, v9
+; CHECK-NEXT: vmv.x.s a6, v8
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: add a0, a6, a0
+; CHECK-NEXT: add a2, a2, a4
+; CHECK-NEXT: add a0, a0, a2
+; CHECK-NEXT: add a0, a0, a5
+; CHECK-NEXT: ret
%e0 = extractelement <8 x i16> %v, i32 0
%e1 = extractelement <8 x i16> %v, i32 1
%e2 = extractelement <8 x i16> %v, i32 2
@@ -443,127 +258,57 @@ define i16 @explode_8xi16(<8 x i16> %v) {
}
define i16 @explode_16xi16(<16 x i16> %v) {
-; RV32-LABEL: explode_16xi16:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset s0, -4
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 2
-; RV32-NEXT: vslidedown.vi v11, v8, 3
-; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 4
-; RV32-NEXT: vmv.x.s a1, v11
-; RV32-NEXT: vslidedown.vi v11, v8, 5
-; RV32-NEXT: vmv.x.s a2, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 6
-; RV32-NEXT: vmv.x.s a3, v11
-; RV32-NEXT: vslidedown.vi v11, v8, 7
-; RV32-NEXT: vmv.x.s a4, v10
-; RV32-NEXT: vmv.x.s a5, v11
-; RV32-NEXT: vsetivli zero, 1, e16, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 8
-; RV32-NEXT: vmv.x.s a6, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 9
-; RV32-NEXT: vmv.x.s a7, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 10
-; RV32-NEXT: vmv.x.s t0, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 11
-; RV32-NEXT: vmv.x.s t1, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 12
-; RV32-NEXT: vmv.x.s t2, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 13
-; RV32-NEXT: vmv.x.s t3, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 14
-; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; RV32-NEXT: vmv.x.s t4, v8
-; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 15
-; RV32-NEXT: vmv.x.s t5, v10
-; RV32-NEXT: vmv.x.s t6, v8
-; RV32-NEXT: srli s0, t4, 16
-; RV32-NEXT: xor t4, t4, s0
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a5, a5, a6
-; RV32-NEXT: add t1, t1, t2
-; RV32-NEXT: add a0, t4, a0
-; RV32-NEXT: add a2, a2, a4
-; RV32-NEXT: add a5, a5, a7
-; RV32-NEXT: add t1, t1, t3
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: add a5, a5, t0
-; RV32-NEXT: add t1, t1, t5
-; RV32-NEXT: add a0, a0, a5
-; RV32-NEXT: add t1, t1, t6
-; RV32-NEXT: add a0, a0, t1
-; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore s0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
-; RV32-NEXT: ret
-;
-; RV64-LABEL: explode_16xi16:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset s0, -8
-; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 2
-; RV64-NEXT: vslidedown.vi v11, v8, 3
-; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: vslidedown.vi v10, v8, 4
-; RV64-NEXT: vmv.x.s a1, v11
-; RV64-NEXT: vslidedown.vi v11, v8, 5
-; RV64-NEXT: vmv.x.s a2, v10
-; RV64-NEXT: vslidedown.vi v10, v8, 6
-; RV64-NEXT: vmv.x.s a3, v11
-; RV64-NEXT: vslidedown.vi v11, v8, 7
-; RV64-NEXT: vmv.x.s a4, v10
-; RV64-NEXT: vmv.x.s a5, v11
-; RV64-NEXT: vsetivli zero, 1, e16, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 8
-; RV64-NEXT: vmv.x.s a6, v10
-; RV64-NEXT: vslidedown.vi v10, v8, 9
-; RV64-NEXT: vmv.x.s a7, v10
-; RV64-NEXT: vslidedown.vi v10, v8, 10
-; RV64-NEXT: vmv.x.s t0, v10
-; RV64-NEXT: vslidedown.vi v10, v8, 11
-; RV64-NEXT: vmv.x.s t1, v10
-; RV64-NEXT: vslidedown.vi v10, v8, 12
-; RV64-NEXT: vmv.x.s t2, v10
-; RV64-NEXT: vslidedown.vi v10, v8, 13
-; RV64-NEXT: vmv.x.s t3, v10
-; RV64-NEXT: vslidedown.vi v10, v8, 14
-; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; RV64-NEXT: vmv.x.s t4, v8
-; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 15
-; RV64-NEXT: vmv.x.s t5, v10
-; RV64-NEXT: vmv.x.s t6, v8
-; RV64-NEXT: srliw s0, t4, 16
-; RV64-NEXT: xor t4, t4, s0
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: add t1, t1, t2
-; RV64-NEXT: add a0, t4, a0
-; RV64-NEXT: add a2, a2, a4
-; RV64-NEXT: add a5, a5, a7
-; RV64-NEXT: add t1, t1, t3
-; RV64-NEXT: add a0, a0, a2
-; RV64-NEXT: add a5, a5, t0
-; RV64-NEXT: add t1, t1, t5
-; RV64-NEXT: add a0, a0, a5
-; RV64-NEXT: add t1, t1, t6
-; RV64-NEXT: add a0, a0, t1
-; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: .cfi_restore s0
-; RV64-NEXT: addi sp, sp, 16
-; RV64-NEXT: .cfi_def_cfa_offset 0
-; RV64-NEXT: ret
+; CHECK-LABEL: explode_16xi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v10, v8, 8
+; CHECK-NEXT: vmv.x.s a0, v10
+; CHECK-NEXT: vslidedown.vi v10, v8, 9
+; CHECK-NEXT: vmv.x.s a1, v10
+; CHECK-NEXT: vslidedown.vi v10, v8, 10
+; CHECK-NEXT: vmv.x.s a2, v10
+; CHECK-NEXT: vslidedown.vi v10, v8, 11
+; CHECK-NEXT: vmv.x.s a3, v10
+; CHECK-NEXT: vslidedown.vi v10, v8, 12
+; CHECK-NEXT: vmv.x.s a4, v10
+; CHECK-NEXT: vslidedown.vi v10, v8, 13
+; CHECK-NEXT: vmv.x.s a5, v10
+; CHECK-NEXT: vslidedown.vi v10, v8, 14
+; CHECK-NEXT: vmv.x.s a6, v10
+; CHECK-NEXT: vslidedown.vi v10, v8, 15
+; CHECK-NEXT: vmv.x.s a7, v10
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v8, 2
+; CHECK-NEXT: vslidedown.vi v10, v8, 3
+; CHECK-NEXT: vmv.x.s t0, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 4
+; CHECK-NEXT: vmv.x.s t1, v10
+; CHECK-NEXT: vslidedown.vi v10, v8, 5
+; CHECK-NEXT: vmv.x.s t2, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 6
+; CHECK-NEXT: vmv.x.s t3, v10
+; CHECK-NEXT: vslidedown.vi v10, v8, 7
+; CHECK-NEXT: vmv.x.s t4, v9
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vmv.x.s t5, v10
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vredxor.vs v8, v8, v9
+; CHECK-NEXT: vmv.x.s t6, v8
+; CHECK-NEXT: add t0, t0, t1
+; CHECK-NEXT: add t2, t2, t3
+; CHECK-NEXT: add a0, t5, a0
+; CHECK-NEXT: add a3, a3, a4
+; CHECK-NEXT: add t0, t6, t0
+; CHECK-NEXT: add t2, t2, t4
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add a3, a3, a5
+; CHECK-NEXT: add t0, t0, t2
+; CHECK-NEXT: add a0, a0, a2
+; CHECK-NEXT: add a3, a3, a6
+; CHECK-NEXT: add a0, t0, a0
+; CHECK-NEXT: add a3, a3, a7
+; CHECK-NEXT: add a0, a0, a3
+; CHECK-NEXT: ret
%e0 = extractelement <16 x i16> %v, i32 0
%e1 = extractelement <16 x i16> %v, i32 1
%e2 = extractelement <16 x i16> %v, i32 2
@@ -599,21 +344,13 @@ define i16 @explode_16xi16(<16 x i16> %v) {
}
define i32 @explode_2xi32(<2 x i32> %v) {
-; RV32-LABEL: explode_2xi32:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT: vmv.s.x v9, zero
-; RV32-NEXT: vredxor.vs v8, v8, v9
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: ret
-;
-; RV64-LABEL: explode_2xi32:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: srli a1, a0, 32
-; RV64-NEXT: xor a0, a0, a1
-; RV64-NEXT: ret
+; CHECK-LABEL: explode_2xi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vredxor.vs v8, v8, v9
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
%e0 = extractelement <2 x i32> %v, i32 0
%e1 = extractelement <2 x i32> %v, i32 1
%add0 = xor i32 %e0, %e1
@@ -640,16 +377,15 @@ define i32 @explode_4xi32(<4 x i32> %v) {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV64-NEXT: vslidedown.vi v9, v8, 2
-; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 3
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 3
; RV64-NEXT: vmv.x.s a1, v9
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT: vredxor.vs v8, v8, v9
; RV64-NEXT: vmv.x.s a2, v8
-; RV64-NEXT: srli a3, a0, 32
-; RV64-NEXT: xor a0, a0, a3
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: addw a0, a0, a1
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: addw a0, a2, a0
; RV64-NEXT: ret
%e0 = extractelement <4 x i32> %v, i32 0
%e1 = extractelement <4 x i32> %v, i32 1
@@ -693,31 +429,30 @@ define i32 @explode_8xi32(<8 x i32> %v) {
;
; RV64-LABEL: explode_8xi32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 2
-; RV64-NEXT: vslidedown.vi v11, v8, 3
-; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: vmv.x.s a1, v11
; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64-NEXT: vslidedown.vi v10, v8, 4
-; RV64-NEXT: vmv.x.s a2, v10
+; RV64-NEXT: vmv.x.s a0, v10
; RV64-NEXT: vslidedown.vi v10, v8, 5
-; RV64-NEXT: vmv.x.s a3, v10
+; RV64-NEXT: vmv.x.s a1, v10
; RV64-NEXT: vslidedown.vi v10, v8, 6
-; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; RV64-NEXT: vmv.x.s a4, v8
-; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 7
+; RV64-NEXT: vmv.x.s a2, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 7
+; RV64-NEXT: vmv.x.s a3, v10
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 2
+; RV64-NEXT: vslidedown.vi v10, v8, 3
+; RV64-NEXT: vmv.x.s a4, v9
+; RV64-NEXT: vmv.s.x v9, zero
; RV64-NEXT: vmv.x.s a5, v10
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT: vredxor.vs v8, v8, v9
; RV64-NEXT: vmv.x.s a6, v8
-; RV64-NEXT: srli a7, a4, 32
-; RV64-NEXT: xor a4, a4, a7
+; RV64-NEXT: add a4, a4, a5
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a0, a4, a0
-; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: add a4, a6, a4
; RV64-NEXT: add a0, a0, a2
-; RV64-NEXT: addw a0, a0, a6
+; RV64-NEXT: add a0, a4, a0
+; RV64-NEXT: addw a0, a0, a3
; RV64-NEXT: ret
%e0 = extractelement <8 x i32> %v, i32 0
%e1 = extractelement <8 x i32> %v, i32 1
@@ -808,64 +543,60 @@ define i32 @explode_16xi32(<16 x i32> %v) {
; RV64-NEXT: .cfi_def_cfa_offset 128
; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s1, 104(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset ra, -8
; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: .cfi_offset s1, -24
; RV64-NEXT: addi s0, sp, 128
; RV64-NEXT: .cfi_def_cfa s0, 0
; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 2
-; RV64-NEXT: vslidedown.vi v13, v8, 3
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vmv.x.s a1, v13
; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64-NEXT: vslidedown.vi v12, v8, 4
-; RV64-NEXT: vmv.x.s a2, v12
+; RV64-NEXT: vmv.x.s a0, v12
; RV64-NEXT: vslidedown.vi v12, v8, 5
-; RV64-NEXT: vmv.x.s a3, v12
+; RV64-NEXT: vmv.x.s a1, v12
; RV64-NEXT: vslidedown.vi v12, v8, 6
-; RV64-NEXT: vmv.x.s a4, v12
+; RV64-NEXT: vmv.x.s a2, v12
; RV64-NEXT: vslidedown.vi v12, v8, 7
-; RV64-NEXT: mv a5, sp
-; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; RV64-NEXT: vmv.x.s a6, v8
+; RV64-NEXT: vmv.x.s a3, v12
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v12, v8, 2
+; RV64-NEXT: vslidedown.vi v13, v8, 3
+; RV64-NEXT: mv a4, sp
+; RV64-NEXT: vmv.x.s a5, v12
+; RV64-NEXT: vmv.s.x v12, zero
+; RV64-NEXT: vmv.x.s a6, v13
; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT: vmv.x.s a7, v12
-; RV64-NEXT: vse32.v v8, (a5)
-; RV64-NEXT: srli a5, a6, 32
-; RV64-NEXT: lw t0, 32(sp)
-; RV64-NEXT: lw t1, 36(sp)
-; RV64-NEXT: lw t2, 40(sp)
-; RV64-NEXT: lw t3, 44(sp)
-; RV64-NEXT: lw t4, 48(sp)
-; RV64-NEXT: lw t5, 52(sp)
-; RV64-NEXT: lw t6, 56(sp)
-; RV64-NEXT: lw s1, 60(sp)
-; RV64-NEXT: xor a5, a6, a5
+; RV64-NEXT: vse32.v v8, (a4)
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT: vredxor.vs v8, v8, v12
+; RV64-NEXT: lw a4, 32(sp)
+; RV64-NEXT: lw a7, 36(sp)
+; RV64-NEXT: lw t0, 40(sp)
+; RV64-NEXT: lw t1, 44(sp)
+; RV64-NEXT: lw t2, 48(sp)
+; RV64-NEXT: lw t3, 52(sp)
+; RV64-NEXT: lw t4, 56(sp)
+; RV64-NEXT: lw t5, 60(sp)
+; RV64-NEXT: vmv.x.s t6, v8
+; RV64-NEXT: add a5, a5, a6
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a0, a5, a0
-; RV64-NEXT: add a2, a2, a4
+; RV64-NEXT: add a5, t6, a5
; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a0, a5, a0
+; RV64-NEXT: add a3, a3, a4
; RV64-NEXT: add a7, a7, t0
-; RV64-NEXT: add t1, t1, t2
-; RV64-NEXT: add t4, t4, t5
+; RV64-NEXT: add t2, t2, t3
+; RV64-NEXT: add a0, a0, a3
+; RV64-NEXT: add a7, a7, t1
+; RV64-NEXT: add t2, t2, t4
; RV64-NEXT: add a0, a0, a7
-; RV64-NEXT: add t1, t1, t3
-; RV64-NEXT: add t4, t4, t6
-; RV64-NEXT: add a0, a0, t1
-; RV64-NEXT: add t4, t4, s1
-; RV64-NEXT: addw a0, a0, t4
+; RV64-NEXT: add t2, t2, t5
+; RV64-NEXT: addw a0, a0, t2
; RV64-NEXT: addi sp, s0, -128
; RV64-NEXT: .cfi_def_cfa sp, 128
; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s1, 104(sp) # 8-byte Folded Reload
; RV64-NEXT: .cfi_restore ra
; RV64-NEXT: .cfi_restore s0
-; RV64-NEXT: .cfi_restore s1
; RV64-NEXT: addi sp, sp, 128
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
@@ -1441,59 +1172,50 @@ define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) {
;
; RV64-LABEL: explode_16xi32_exact_vlen:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset s0, -8
; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV64-NEXT: vslidedown.vi v12, v8, 2
-; RV64-NEXT: vslidedown.vi v13, v8, 3
-; RV64-NEXT: vmv.x.s a0, v9
-; RV64-NEXT: vslidedown.vi v14, v9, 1
-; RV64-NEXT: vslidedown.vi v15, v9, 2
-; RV64-NEXT: vslidedown.vi v9, v9, 3
-; RV64-NEXT: vmv.x.s a1, v10
-; RV64-NEXT: vslidedown.vi v16, v10, 1
-; RV64-NEXT: vslidedown.vi v17, v10, 2
-; RV64-NEXT: vslidedown.vi v10, v10, 3
-; RV64-NEXT: vmv.x.s a2, v11
-; RV64-NEXT: vslidedown.vi v18, v11, 1
-; RV64-NEXT: vslidedown.vi v19, v11, 2
-; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; RV64-NEXT: vmv.x.s a3, v8
-; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v11, 3
+; RV64-NEXT: vmv.x.s a0, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 3
+; RV64-NEXT: vmv.x.s a1, v9
+; RV64-NEXT: vmv.x.s a2, v12
+; RV64-NEXT: vslidedown.vi v12, v9, 1
+; RV64-NEXT: vmv.x.s a3, v12
+; RV64-NEXT: vslidedown.vi v12, v9, 2
; RV64-NEXT: vmv.x.s a4, v12
-; RV64-NEXT: vmv.x.s a5, v13
-; RV64-NEXT: vmv.x.s a6, v14
-; RV64-NEXT: vmv.x.s a7, v15
+; RV64-NEXT: vslidedown.vi v9, v9, 3
+; RV64-NEXT: vmv.x.s a5, v10
+; RV64-NEXT: vmv.x.s a6, v9
+; RV64-NEXT: vslidedown.vi v9, v10, 1
+; RV64-NEXT: vmv.x.s a7, v9
+; RV64-NEXT: vslidedown.vi v9, v10, 2
; RV64-NEXT: vmv.x.s t0, v9
-; RV64-NEXT: vmv.x.s t1, v16
-; RV64-NEXT: vmv.x.s t2, v17
-; RV64-NEXT: vmv.x.s t3, v10
-; RV64-NEXT: vmv.x.s t4, v18
-; RV64-NEXT: vmv.x.s t5, v19
+; RV64-NEXT: vslidedown.vi v9, v10, 3
+; RV64-NEXT: vmv.x.s t1, v11
+; RV64-NEXT: vmv.x.s t2, v9
+; RV64-NEXT: vslidedown.vi v9, v11, 1
+; RV64-NEXT: vmv.x.s t3, v9
+; RV64-NEXT: vslidedown.vi v9, v11, 2
+; RV64-NEXT: vmv.x.s t4, v9
+; RV64-NEXT: vslidedown.vi v9, v11, 3
+; RV64-NEXT: vmv.x.s t5, v9
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT: vredxor.vs v8, v8, v9
; RV64-NEXT: vmv.x.s t6, v8
-; RV64-NEXT: srli s0, a3, 32
-; RV64-NEXT: xor a3, a3, s0
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: add a0, a0, a6
-; RV64-NEXT: add a1, t0, a1
-; RV64-NEXT: add a2, t3, a2
-; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: add a0, a0, a7
-; RV64-NEXT: add a1, a1, t1
-; RV64-NEXT: add a2, a2, t4
-; RV64-NEXT: add a0, a3, a0
-; RV64-NEXT: add a1, a1, t2
-; RV64-NEXT: add a2, a2, t5
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a1, a1, a3
+; RV64-NEXT: add a5, a6, a5
+; RV64-NEXT: add t1, t2, t1
+; RV64-NEXT: add a0, t6, a0
+; RV64-NEXT: add a1, a1, a4
+; RV64-NEXT: add a5, a5, a7
+; RV64-NEXT: add t1, t1, t3
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a2, a2, t6
-; RV64-NEXT: addw a0, a0, a2
-; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: .cfi_restore s0
-; RV64-NEXT: addi sp, sp, 16
-; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: add a5, a5, t0
+; RV64-NEXT: add t1, t1, t4
+; RV64-NEXT: add a0, a0, a5
+; RV64-NEXT: add t1, t1, t5
+; RV64-NEXT: addw a0, a0, t1
; RV64-NEXT: ret
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
@@ -1528,5 +1250,3 @@ define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) {
%add14 = add i32 %add13, %e15
ret i32 %add14
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}