[llvm] cd03d97 - [RISCV] Add test coverage for sum reduction recognition in DAG
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 29 07:55:09 PDT 2023
Author: Philip Reames
Date: 2023-09-29T07:54:55-07:00
New Revision: cd03d970431e9aa5d7df1bbef85353c4f2a21d40
URL: https://github.com/llvm/llvm-project/commit/cd03d970431e9aa5d7df1bbef85353c4f2a21d40
DIFF: https://github.com/llvm/llvm-project/commit/cd03d970431e9aa5d7df1bbef85353c4f2a21d40.diff
LOG: [RISCV] Add test coverage for sum reduction recognition in DAG
Also adjust an existing test so it is no longer a simple reduction, preserving its original intent.
Added:
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
Modified:
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
Removed:
################################################################################
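Editor's note (not part of the commit): the new tests exercise the pattern sketched below, a left-associated chain of extractelement and add feeding a scalar result, which a DAG combine could recognize as a vector sum reduction. The sketch is illustrative only; the function names are hypothetical, and the suggestion that recognition would reach the same code as the generic intrinsic is an assumption about the intended follow-up, not output produced by this commit.

define i32 @sum_tree_sketch(<4 x i32> %v) {
  ; Scalarized reduction tree: extract every lane, then chain the adds.
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  %a0 = add i32 %e0, %e1
  %a1 = add i32 %a0, %e2
  %a2 = add i32 %a1, %e3
  ret i32 %a2
}

; Equivalent reduction expressed with the generic intrinsic, which the RISC-V
; backend with +v already lowers to a single vredsum.vs. Recognizing the tree
; above as a sum reduction could (assumption) allow the same lowering.
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

define i32 @sum_intrinsic_sketch(<4 x i32> %v) {
  %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
  ret i32 %r
}

This is also why the explodevector tests below swap one add for an xor: the xor keeps those functions from matching the pure sum-reduction pattern, so they keep testing per-element extraction rather than any future reduction formation.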
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
index 84ef9283802b901..d22505eac047886 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
@@ -29,15 +29,15 @@ define i8 @explode_4xi8(<4 x i8> %v) {
; CHECK-NEXT: vslidedown.vi v8, v8, 3
; CHECK-NEXT: vmv.x.s a3, v8
; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: add a0, a0, a2
+; CHECK-NEXT: xor a0, a0, a2
+; CHECK-NEXT: add a0, a0, a3
; CHECK-NEXT: ret
%e0 = extractelement <4 x i8> %v, i32 0
%e1 = extractelement <4 x i8> %v, i32 1
%e2 = extractelement <4 x i8> %v, i32 2
%e3 = extractelement <4 x i8> %v, i32 3
%add0 = add i8 %e0, %e1
- %add1 = add i8 %add0, %e2
+ %add1 = xor i8 %add0, %e2
%add2 = add i8 %add1, %e3
ret i8 %add2
}
@@ -63,12 +63,12 @@ define i8 @explode_8xi8(<8 x i8> %v) {
; CHECK-NEXT: vslidedown.vi v8, v8, 7
; CHECK-NEXT: vmv.x.s a7, v8
; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: add a0, a0, a2
-; CHECK-NEXT: add a4, a4, a5
-; CHECK-NEXT: add a4, a4, a6
-; CHECK-NEXT: add a0, a0, a4
-; CHECK-NEXT: add a0, a0, a7
+; CHECK-NEXT: xor a0, a0, a2
+; CHECK-NEXT: add a3, a3, a4
+; CHECK-NEXT: add a3, a3, a5
+; CHECK-NEXT: add a0, a0, a3
+; CHECK-NEXT: add a6, a6, a7
+; CHECK-NEXT: add a0, a0, a6
; CHECK-NEXT: ret
%e0 = extractelement <8 x i8> %v, i32 0
%e1 = extractelement <8 x i8> %v, i32 1
@@ -79,7 +79,7 @@ define i8 @explode_8xi8(<8 x i8> %v) {
%e6 = extractelement <8 x i8> %v, i32 6
%e7 = extractelement <8 x i8> %v, i32 7
%add0 = add i8 %e0, %e1
- %add1 = add i8 %add0, %e2
+ %add1 = xor i8 %add0, %e2
%add2 = add i8 %add1, %e3
%add3 = add i8 %add2, %e4
%add4 = add i8 %add3, %e5
@@ -128,20 +128,20 @@ define i8 @explode_16xi8(<16 x i8> %v) {
; RV32-NEXT: vslidedown.vi v8, v8, 15
; RV32-NEXT: vmv.x.s s0, v8
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: add a4, a4, a5
-; RV32-NEXT: add a4, a4, a6
-; RV32-NEXT: add a0, a0, a4
-; RV32-NEXT: add a7, a7, t0
-; RV32-NEXT: add a7, a7, t1
-; RV32-NEXT: add a7, a7, t2
-; RV32-NEXT: add a0, a0, a7
-; RV32-NEXT: add t3, t3, t4
-; RV32-NEXT: add t3, t3, t5
-; RV32-NEXT: add t3, t3, t6
-; RV32-NEXT: add t3, t3, s0
-; RV32-NEXT: add a0, a0, t3
+; RV32-NEXT: xor a0, a0, a2
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: add a3, a3, a5
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: add a6, a6, a7
+; RV32-NEXT: add a6, a6, t0
+; RV32-NEXT: add a6, a6, t1
+; RV32-NEXT: add a0, a0, a6
+; RV32-NEXT: add t2, t2, t3
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: add t2, t2, t5
+; RV32-NEXT: add t2, t2, t6
+; RV32-NEXT: add a0, a0, t2
+; RV32-NEXT: add a0, a0, s0
; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
@@ -185,20 +185,20 @@ define i8 @explode_16xi8(<16 x i8> %v) {
; RV64-NEXT: vslidedown.vi v8, v8, 15
; RV64-NEXT: vmv.x.s s0, v8
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a0, a0, a2
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: add a4, a4, a6
-; RV64-NEXT: add a0, a0, a4
-; RV64-NEXT: add a7, a7, t0
-; RV64-NEXT: add a7, a7, t1
-; RV64-NEXT: add a7, a7, t2
-; RV64-NEXT: add a0, a0, a7
-; RV64-NEXT: add t3, t3, t4
-; RV64-NEXT: add t3, t3, t5
-; RV64-NEXT: add t3, t3, t6
-; RV64-NEXT: add t3, t3, s0
-; RV64-NEXT: add a0, a0, t3
+; RV64-NEXT: xor a0, a0, a2
+; RV64-NEXT: add a3, a3, a4
+; RV64-NEXT: add a3, a3, a5
+; RV64-NEXT: add a0, a0, a3
+; RV64-NEXT: add a6, a6, a7
+; RV64-NEXT: add a6, a6, t0
+; RV64-NEXT: add a6, a6, t1
+; RV64-NEXT: add a0, a0, a6
+; RV64-NEXT: add t2, t2, t3
+; RV64-NEXT: add t2, t2, t4
+; RV64-NEXT: add t2, t2, t5
+; RV64-NEXT: add t2, t2, t6
+; RV64-NEXT: add a0, a0, t2
+; RV64-NEXT: add a0, a0, s0
; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
@@ -219,7 +219,7 @@ define i8 @explode_16xi8(<16 x i8> %v) {
%e14 = extractelement <16 x i8> %v, i32 14
%e15 = extractelement <16 x i8> %v, i32 15
%add0 = add i8 %e0, %e1
- %add1 = add i8 %add0, %e2
+ %add1 = xor i8 %add0, %e2
%add2 = add i8 %add1, %e3
%add3 = add i8 %add2, %e4
%add4 = add i8 %add3, %e5
@@ -263,15 +263,15 @@ define i16 @explode_4xi16(<4 x i16> %v) {
; CHECK-NEXT: vslidedown.vi v8, v8, 3
; CHECK-NEXT: vmv.x.s a3, v8
; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: add a0, a0, a2
+; CHECK-NEXT: xor a0, a0, a2
+; CHECK-NEXT: add a0, a0, a3
; CHECK-NEXT: ret
%e0 = extractelement <4 x i16> %v, i32 0
%e1 = extractelement <4 x i16> %v, i32 1
%e2 = extractelement <4 x i16> %v, i32 2
%e3 = extractelement <4 x i16> %v, i32 3
%add0 = add i16 %e0, %e1
- %add1 = add i16 %add0, %e2
+ %add1 = xor i16 %add0, %e2
%add2 = add i16 %add1, %e3
ret i16 %add2
}
@@ -297,12 +297,12 @@ define i16 @explode_8xi16(<8 x i16> %v) {
; CHECK-NEXT: vslidedown.vi v8, v8, 7
; CHECK-NEXT: vmv.x.s a7, v8
; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: add a0, a0, a2
-; CHECK-NEXT: add a4, a4, a5
-; CHECK-NEXT: add a4, a4, a6
-; CHECK-NEXT: add a0, a0, a4
-; CHECK-NEXT: add a0, a0, a7
+; CHECK-NEXT: xor a0, a0, a2
+; CHECK-NEXT: add a3, a3, a4
+; CHECK-NEXT: add a3, a3, a5
+; CHECK-NEXT: add a0, a0, a3
+; CHECK-NEXT: add a6, a6, a7
+; CHECK-NEXT: add a0, a0, a6
; CHECK-NEXT: ret
%e0 = extractelement <8 x i16> %v, i32 0
%e1 = extractelement <8 x i16> %v, i32 1
@@ -313,7 +313,7 @@ define i16 @explode_8xi16(<8 x i16> %v) {
%e6 = extractelement <8 x i16> %v, i32 6
%e7 = extractelement <8 x i16> %v, i32 7
%add0 = add i16 %e0, %e1
- %add1 = add i16 %add0, %e2
+ %add1 = xor i16 %add0, %e2
%add2 = add i16 %add1, %e3
%add3 = add i16 %add2, %e4
%add4 = add i16 %add3, %e5
@@ -363,20 +363,20 @@ define i16 @explode_16xi16(<16 x i16> %v) {
; RV32-NEXT: vslidedown.vi v8, v8, 15
; RV32-NEXT: vmv.x.s s0, v8
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: add a4, a4, a5
-; RV32-NEXT: add a4, a4, a6
-; RV32-NEXT: add a0, a0, a4
-; RV32-NEXT: add a7, a7, t0
-; RV32-NEXT: add a7, a7, t1
-; RV32-NEXT: add a7, a7, t2
-; RV32-NEXT: add a0, a0, a7
-; RV32-NEXT: add t3, t3, t4
-; RV32-NEXT: add t3, t3, t5
-; RV32-NEXT: add t3, t3, t6
-; RV32-NEXT: add t3, t3, s0
-; RV32-NEXT: add a0, a0, t3
+; RV32-NEXT: xor a0, a0, a2
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: add a3, a3, a5
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: add a6, a6, a7
+; RV32-NEXT: add a6, a6, t0
+; RV32-NEXT: add a6, a6, t1
+; RV32-NEXT: add a0, a0, a6
+; RV32-NEXT: add t2, t2, t3
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: add t2, t2, t5
+; RV32-NEXT: add t2, t2, t6
+; RV32-NEXT: add a0, a0, t2
+; RV32-NEXT: add a0, a0, s0
; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
@@ -421,20 +421,20 @@ define i16 @explode_16xi16(<16 x i16> %v) {
; RV64-NEXT: vslidedown.vi v8, v8, 15
; RV64-NEXT: vmv.x.s s0, v8
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a0, a0, a2
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: add a4, a4, a6
-; RV64-NEXT: add a0, a0, a4
-; RV64-NEXT: add a7, a7, t0
-; RV64-NEXT: add a7, a7, t1
-; RV64-NEXT: add a7, a7, t2
-; RV64-NEXT: add a0, a0, a7
-; RV64-NEXT: add t3, t3, t4
-; RV64-NEXT: add t3, t3, t5
-; RV64-NEXT: add t3, t3, t6
-; RV64-NEXT: add t3, t3, s0
-; RV64-NEXT: add a0, a0, t3
+; RV64-NEXT: xor a0, a0, a2
+; RV64-NEXT: add a3, a3, a4
+; RV64-NEXT: add a3, a3, a5
+; RV64-NEXT: add a0, a0, a3
+; RV64-NEXT: add a6, a6, a7
+; RV64-NEXT: add a6, a6, t0
+; RV64-NEXT: add a6, a6, t1
+; RV64-NEXT: add a0, a0, a6
+; RV64-NEXT: add t2, t2, t3
+; RV64-NEXT: add t2, t2, t4
+; RV64-NEXT: add t2, t2, t5
+; RV64-NEXT: add t2, t2, t6
+; RV64-NEXT: add a0, a0, t2
+; RV64-NEXT: add a0, a0, s0
; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
@@ -455,7 +455,7 @@ define i16 @explode_16xi16(<16 x i16> %v) {
%e14 = extractelement <16 x i16> %v, i32 14
%e15 = extractelement <16 x i16> %v, i32 15
%add0 = add i16 %e0, %e1
- %add1 = add i16 %add0, %e2
+ %add1 = xor i16 %add0, %e2
%add2 = add i16 %add1, %e3
%add3 = add i16 %add2, %e4
%add4 = add i16 %add3, %e5
@@ -508,8 +508,8 @@ define i32 @explode_4xi32(<4 x i32> %v) {
; RV32-NEXT: vslidedown.vi v8, v8, 3
; RV32-NEXT: vmv.x.s a3, v8
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: xor a0, a0, a2
+; RV32-NEXT: add a0, a0, a3
; RV32-NEXT: ret
;
; RV64-LABEL: explode_4xi32:
@@ -523,15 +523,15 @@ define i32 @explode_4xi32(<4 x i32> %v) {
; RV64-NEXT: vslidedown.vi v8, v8, 3
; RV64-NEXT: vmv.x.s a3, v8
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: addw a0, a0, a2
+; RV64-NEXT: xor a0, a0, a2
+; RV64-NEXT: addw a0, a0, a3
; RV64-NEXT: ret
%e0 = extractelement <4 x i32> %v, i32 0
%e1 = extractelement <4 x i32> %v, i32 1
%e2 = extractelement <4 x i32> %v, i32 2
%e3 = extractelement <4 x i32> %v, i32 3
%add0 = add i32 %e0, %e1
- %add1 = add i32 %add0, %e2
+ %add1 = xor i32 %add0, %e2
%add2 = add i32 %add1, %e3
ret i32 %add2
}
@@ -558,12 +558,12 @@ define i32 @explode_8xi32(<8 x i32> %v) {
; RV32-NEXT: vslidedown.vi v8, v8, 7
; RV32-NEXT: vmv.x.s a7, v8
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: add a4, a4, a5
-; RV32-NEXT: add a4, a4, a6
-; RV32-NEXT: add a0, a0, a4
-; RV32-NEXT: add a0, a0, a7
+; RV32-NEXT: xor a0, a0, a2
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: add a3, a3, a5
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: add a6, a6, a7
+; RV32-NEXT: add a0, a0, a6
; RV32-NEXT: ret
;
; RV64-LABEL: explode_8xi32:
@@ -586,12 +586,12 @@ define i32 @explode_8xi32(<8 x i32> %v) {
; RV64-NEXT: vslidedown.vi v8, v8, 7
; RV64-NEXT: vmv.x.s a7, v8
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a0, a0, a2
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: add a4, a4, a6
-; RV64-NEXT: add a0, a0, a4
-; RV64-NEXT: addw a0, a0, a7
+; RV64-NEXT: xor a0, a0, a2
+; RV64-NEXT: add a3, a3, a4
+; RV64-NEXT: add a3, a3, a5
+; RV64-NEXT: add a0, a0, a3
+; RV64-NEXT: add a6, a6, a7
+; RV64-NEXT: addw a0, a0, a6
; RV64-NEXT: ret
%e0 = extractelement <8 x i32> %v, i32 0
%e1 = extractelement <8 x i32> %v, i32 1
@@ -602,7 +602,7 @@ define i32 @explode_8xi32(<8 x i32> %v) {
%e6 = extractelement <8 x i32> %v, i32 6
%e7 = extractelement <8 x i32> %v, i32 7
%add0 = add i32 %e0, %e1
- %add1 = add i32 %add0, %e2
+ %add1 = xor i32 %add0, %e2
%add2 = add i32 %add1, %e3
%add3 = add i32 %add2, %e4
%add4 = add i32 %add3, %e5
@@ -654,13 +654,13 @@ define i32 @explode_16xi32(<16 x i32> %v) {
; RV32-NEXT: lw t6, 56(sp)
; RV32-NEXT: lw s2, 60(sp)
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: add a4, a4, a5
-; RV32-NEXT: add a4, a4, a6
-; RV32-NEXT: add a0, a0, a4
-; RV32-NEXT: add a7, a7, t0
-; RV32-NEXT: add a0, a0, a7
+; RV32-NEXT: xor a0, a0, a2
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: add a3, a3, a5
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: add a6, a6, a7
+; RV32-NEXT: add a6, a6, t0
+; RV32-NEXT: add a0, a0, a6
; RV32-NEXT: add t1, t1, t2
; RV32-NEXT: add t1, t1, t3
; RV32-NEXT: add a0, a0, t1
@@ -717,13 +717,13 @@ define i32 @explode_16xi32(<16 x i32> %v) {
; RV64-NEXT: lw t6, 56(sp)
; RV64-NEXT: lw s2, 60(sp)
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a0, a0, a2
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: add a4, a4, a6
-; RV64-NEXT: add a0, a0, a4
-; RV64-NEXT: add a7, a7, t0
-; RV64-NEXT: add a0, a0, a7
+; RV64-NEXT: xor a0, a0, a2
+; RV64-NEXT: add a3, a3, a4
+; RV64-NEXT: add a3, a3, a5
+; RV64-NEXT: add a0, a0, a3
+; RV64-NEXT: add a6, a6, a7
+; RV64-NEXT: add a6, a6, t0
+; RV64-NEXT: add a0, a0, a6
; RV64-NEXT: add t1, t1, t2
; RV64-NEXT: add t1, t1, t3
; RV64-NEXT: add a0, a0, t1
@@ -754,7 +754,7 @@ define i32 @explode_16xi32(<16 x i32> %v) {
%e14 = extractelement <16 x i32> %v, i32 14
%e15 = extractelement <16 x i32> %v, i32 15
%add0 = add i32 %e0, %e1
- %add1 = add i32 %add0, %e2
+ %add1 = xor i32 %add0, %e2
%add2 = add i32 %add1, %e3
%add3 = add i32 %add2, %e4
%add4 = add i32 %add3, %e5
@@ -821,19 +821,17 @@ define i64 @explode_4xi64(<4 x i64> %v) {
; RV32-NEXT: vmv.x.s a6, v10
; RV32-NEXT: vslidedown.vi v8, v8, 3
; RV32-NEXT: vsrl.vx v10, v8, a0
-; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: vmv.x.s a7, v8
+; RV32-NEXT: vmv.x.s a7, v10
+; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: add a1, a1, a3
; RV32-NEXT: add a4, a2, a4
; RV32-NEXT: sltu a2, a4, a2
; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: add a6, a4, a6
-; RV32-NEXT: sltu a2, a6, a4
-; RV32-NEXT: add a1, a1, a5
+; RV32-NEXT: xor a1, a1, a5
+; RV32-NEXT: xor a2, a4, a6
; RV32-NEXT: add a0, a2, a0
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: add a0, a6, a7
-; RV32-NEXT: sltu a2, a0, a6
+; RV32-NEXT: sltu a2, a0, a2
+; RV32-NEXT: add a1, a1, a7
; RV32-NEXT: add a1, a1, a2
; RV32-NEXT: ret
;
@@ -849,15 +847,15 @@ define i64 @explode_4xi64(<4 x i64> %v) {
; RV64-NEXT: vslidedown.vi v8, v8, 3
; RV64-NEXT: vmv.x.s a3, v8
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: xor a0, a0, a2
+; RV64-NEXT: add a0, a0, a3
; RV64-NEXT: ret
%e0 = extractelement <4 x i64> %v, i32 0
%e1 = extractelement <4 x i64> %v, i32 1
%e2 = extractelement <4 x i64> %v, i32 2
%e3 = extractelement <4 x i64> %v, i32 3
%add0 = add i64 %e0, %e1
- %add1 = add i64 %add0, %e2
+ %add1 = xor i64 %add0, %e2
%add2 = add i64 %add1, %e3
ret i64 %add2
}
@@ -907,13 +905,11 @@ define i64 @explode_8xi64(<8 x i64> %v) {
; RV32-NEXT: add a4, a2, a4
; RV32-NEXT: sltu a2, a4, a2
; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: add a6, a4, a6
-; RV32-NEXT: sltu a2, a6, a4
-; RV32-NEXT: add a1, a1, a5
-; RV32-NEXT: add a2, a2, a7
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: add t0, a6, t0
-; RV32-NEXT: sltu a2, t0, a6
+; RV32-NEXT: xor a1, a1, a5
+; RV32-NEXT: xor a2, a4, a6
+; RV32-NEXT: add t0, a2, t0
+; RV32-NEXT: sltu a2, t0, a2
+; RV32-NEXT: add a1, a1, a7
; RV32-NEXT: add a2, a2, t1
; RV32-NEXT: add a1, a1, a2
; RV32-NEXT: add t2, t0, t2
@@ -963,12 +959,12 @@ define i64 @explode_8xi64(<8 x i64> %v) {
; RV64-NEXT: ld a6, 48(sp)
; RV64-NEXT: ld a7, 56(sp)
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: xor a0, a0, a2
+; RV64-NEXT: add a0, a0, a3
+; RV64-NEXT: add a4, a4, a5
; RV64-NEXT: add a0, a0, a4
-; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: add a0, a0, a5
-; RV64-NEXT: add a0, a0, a7
+; RV64-NEXT: add a6, a6, a7
+; RV64-NEXT: add a0, a0, a6
; RV64-NEXT: addi sp, s0, -128
; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
@@ -983,7 +979,7 @@ define i64 @explode_8xi64(<8 x i64> %v) {
%e6 = extractelement <8 x i64> %v, i32 6
%e7 = extractelement <8 x i64> %v, i32 7
%add0 = add i64 %e0, %e1
- %add1 = add i64 %add0, %e2
+ %add1 = xor i64 %add0, %e2
%add2 = add i64 %add1, %e3
%add3 = add i64 %add2, %e4
%add4 = add i64 %add3, %e5
@@ -1031,113 +1027,111 @@ define i64 @explode_16xi64(<16 x i64> %v) {
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: vslidedown.vi v16, v8, 1
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s11, v24
-; RV32-NEXT: vmv.x.s ra, v16
-; RV32-NEXT: vslidedown.vi v16, v8, 2
-; RV32-NEXT: vsrl.vx v24, v16, a1
; RV32-NEXT: vmv.x.s a3, v24
; RV32-NEXT: vmv.x.s a4, v16
+; RV32-NEXT: vslidedown.vi v16, v8, 2
+; RV32-NEXT: vsrl.vx v24, v16, a1
+; RV32-NEXT: vmv.x.s a5, v24
+; RV32-NEXT: vmv.x.s a6, v16
; RV32-NEXT: vslidedown.vi v16, v8, 3
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s0, v24
-; RV32-NEXT: vmv.x.s a5, v16
+; RV32-NEXT: vmv.x.s a7, v24
+; RV32-NEXT: vmv.x.s t0, v16
; RV32-NEXT: vslidedown.vi v16, v8, 4
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s1, v24
-; RV32-NEXT: vmv.x.s a6, v16
+; RV32-NEXT: vmv.x.s s3, v24
+; RV32-NEXT: vmv.x.s t1, v16
; RV32-NEXT: vslidedown.vi v16, v8, 5
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s2, v24
-; RV32-NEXT: vmv.x.s a7, v16
+; RV32-NEXT: vmv.x.s s4, v24
+; RV32-NEXT: vmv.x.s t2, v16
; RV32-NEXT: vslidedown.vi v16, v8, 6
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s3, v24
-; RV32-NEXT: vmv.x.s t0, v16
+; RV32-NEXT: vmv.x.s s5, v24
+; RV32-NEXT: vmv.x.s t3, v16
; RV32-NEXT: vslidedown.vi v16, v8, 7
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s4, v24
-; RV32-NEXT: vmv.x.s t1, v16
+; RV32-NEXT: vmv.x.s s6, v24
+; RV32-NEXT: vmv.x.s t4, v16
; RV32-NEXT: vslidedown.vi v16, v8, 8
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s5, v24
-; RV32-NEXT: vmv.x.s t2, v16
+; RV32-NEXT: vmv.x.s s7, v24
+; RV32-NEXT: vmv.x.s t5, v16
; RV32-NEXT: vslidedown.vi v16, v8, 9
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s6, v24
-; RV32-NEXT: vmv.x.s t3, v16
+; RV32-NEXT: vmv.x.s s8, v24
+; RV32-NEXT: vmv.x.s t6, v16
; RV32-NEXT: vslidedown.vi v16, v8, 10
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s7, v24
-; RV32-NEXT: vmv.x.s t4, v16
+; RV32-NEXT: vmv.x.s s9, v24
+; RV32-NEXT: vmv.x.s s0, v16
; RV32-NEXT: vslidedown.vi v16, v8, 11
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s8, v24
-; RV32-NEXT: vmv.x.s t5, v16
+; RV32-NEXT: vmv.x.s s10, v24
+; RV32-NEXT: vmv.x.s s1, v16
; RV32-NEXT: vslidedown.vi v16, v8, 12
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s9, v24
-; RV32-NEXT: vmv.x.s t6, v16
+; RV32-NEXT: vmv.x.s s11, v24
+; RV32-NEXT: vmv.x.s s2, v16
; RV32-NEXT: vslidedown.vi v24, v8, 13
; RV32-NEXT: vsrl.vx v16, v24, a1
-; RV32-NEXT: vmv.x.s s10, v16
+; RV32-NEXT: vmv.x.s ra, v16
; RV32-NEXT: vslidedown.vi v16, v8, 14
; RV32-NEXT: vsrl.vx v0, v16, a1
; RV32-NEXT: vslidedown.vi v8, v8, 15
; RV32-NEXT: vmv.x.s a2, v24
; RV32-NEXT: vsrl.vx v24, v8, a1
; RV32-NEXT: lw a1, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT: add s11, a1, s11
-; RV32-NEXT: add ra, a0, ra
-; RV32-NEXT: sltu a0, ra, a0
-; RV32-NEXT: add a0, s11, a0
-; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: add a4, ra, a4
-; RV32-NEXT: sltu a1, a4, ra
-; RV32-NEXT: add a1, a1, s0
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a5, a4, a5
-; RV32-NEXT: sltu a1, a5, a4
-; RV32-NEXT: add a1, a1, s1
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a6, a5, a6
-; RV32-NEXT: sltu a1, a6, a5
-; RV32-NEXT: add a1, a1, s2
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a7, a6, a7
-; RV32-NEXT: sltu a1, a7, a6
+; RV32-NEXT: add a3, a1, a3
+; RV32-NEXT: add a4, a0, a4
+; RV32-NEXT: sltu a0, a4, a0
+; RV32-NEXT: add a0, a3, a0
+; RV32-NEXT: xor a0, a0, a5
+; RV32-NEXT: xor a1, a4, a6
+; RV32-NEXT: add a0, a0, a7
+; RV32-NEXT: add t0, a1, t0
+; RV32-NEXT: sltu a1, t0, a1
; RV32-NEXT: add a1, a1, s3
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t0, a7, t0
-; RV32-NEXT: sltu a1, t0, a7
-; RV32-NEXT: add a1, a1, s4
-; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add t1, t0, t1
; RV32-NEXT: sltu a1, t1, t0
-; RV32-NEXT: add a1, a1, s5
+; RV32-NEXT: add a1, a1, s4
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add t2, t1, t2
; RV32-NEXT: sltu a1, t2, t1
-; RV32-NEXT: add a1, a1, s6
+; RV32-NEXT: add a1, a1, s5
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add t3, t2, t3
; RV32-NEXT: sltu a1, t3, t2
-; RV32-NEXT: add a1, a1, s7
+; RV32-NEXT: add a1, a1, s6
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add t4, t3, t4
; RV32-NEXT: sltu a1, t4, t3
-; RV32-NEXT: add a1, a1, s8
+; RV32-NEXT: add a1, a1, s7
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add t5, t4, t5
; RV32-NEXT: sltu a1, t5, t4
-; RV32-NEXT: add a1, a1, s9
+; RV32-NEXT: add a1, a1, s8
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add t6, t5, t6
; RV32-NEXT: sltu a1, t6, t5
+; RV32-NEXT: add a1, a1, s9
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add s0, t6, s0
+; RV32-NEXT: sltu a1, s0, t6
; RV32-NEXT: add a1, a1, s10
; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add s1, s0, s1
+; RV32-NEXT: sltu a1, s1, s0
+; RV32-NEXT: add a1, a1, s11
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add s2, s1, s2
+; RV32-NEXT: sltu a1, s2, s1
+; RV32-NEXT: add a1, a1, ra
+; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: vmv.x.s a1, v0
-; RV32-NEXT: add a2, t6, a2
-; RV32-NEXT: sltu a3, a2, t6
+; RV32-NEXT: add a2, s2, a2
+; RV32-NEXT: sltu a3, a2, s2
; RV32-NEXT: add a1, a3, a1
; RV32-NEXT: vmv.x.s a3, v16
; RV32-NEXT: add a0, a0, a1
@@ -1204,20 +1198,20 @@ define i64 @explode_16xi64(<16 x i64> %v) {
; RV64-NEXT: ld t6, 112(sp)
; RV64-NEXT: ld s2, 120(sp)
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: xor a0, a0, a2
+; RV64-NEXT: add a0, a0, a3
+; RV64-NEXT: add a4, a4, a5
; RV64-NEXT: add a0, a0, a4
-; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: add a0, a0, a5
-; RV64-NEXT: add a7, a7, t0
-; RV64-NEXT: add a7, a7, t1
-; RV64-NEXT: add a0, a0, a7
-; RV64-NEXT: add t2, t2, t3
-; RV64-NEXT: add t2, t2, t4
-; RV64-NEXT: add t2, t2, t5
-; RV64-NEXT: add a0, a0, t2
-; RV64-NEXT: add t6, t6, s2
-; RV64-NEXT: add a0, a0, t6
+; RV64-NEXT: add a6, a6, a7
+; RV64-NEXT: add a6, a6, t0
+; RV64-NEXT: add a0, a0, a6
+; RV64-NEXT: add t1, t1, t2
+; RV64-NEXT: add t1, t1, t3
+; RV64-NEXT: add t1, t1, t4
+; RV64-NEXT: add a0, a0, t1
+; RV64-NEXT: add t5, t5, t6
+; RV64-NEXT: add t5, t5, s2
+; RV64-NEXT: add a0, a0, t5
; RV64-NEXT: addi sp, s0, -256
; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload
@@ -1241,7 +1235,7 @@ define i64 @explode_16xi64(<16 x i64> %v) {
%e14 = extractelement <16 x i64> %v, i32 14
%e15 = extractelement <16 x i64> %v, i32 15
%add0 = add i64 %e0, %e1
- %add1 = add i64 %add0, %e2
+ %add1 = xor i64 %add0, %e2
%add2 = add i64 %add1, %e3
%add3 = add i64 %add2, %e4
%add4 = add i64 %add3, %e5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
new file mode 100644
index 000000000000000..5ef6b291e309517
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
@@ -0,0 +1,1243 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+
+define i32 @reduce_sum_2xi32(<2 x i32> %v) {
+; RV32-LABEL: reduce_sum_2xi32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vslidedown.vi v8, v8, 1
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: reduce_sum_2xi32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vslidedown.vi v8, v8, 1
+; RV64-NEXT: vmv.x.s a1, v8
+; RV64-NEXT: addw a0, a0, a1
+; RV64-NEXT: ret
+ %e0 = extractelement <2 x i32> %v, i32 0
+ %e1 = extractelement <2 x i32> %v, i32 1
+ %add0 = add i32 %e0, %e1
+ ret i32 %add0
+}
+
+define i32 @reduce_sum_4xi32(<4 x i32> %v) {
+; RV32-LABEL: reduce_sum_4xi32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vslidedown.vi v9, v8, 1
+; RV32-NEXT: vmv.x.s a1, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 2
+; RV32-NEXT: vmv.x.s a2, v9
+; RV32-NEXT: vslidedown.vi v8, v8, 3
+; RV32-NEXT: vmv.x.s a3, v8
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: ret
+;
+; RV64-LABEL: reduce_sum_4xi32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vslidedown.vi v9, v8, 1
+; RV64-NEXT: vmv.x.s a1, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 2
+; RV64-NEXT: vmv.x.s a2, v9
+; RV64-NEXT: vslidedown.vi v8, v8, 3
+; RV64-NEXT: vmv.x.s a3, v8
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: addw a0, a0, a2
+; RV64-NEXT: ret
+ %e0 = extractelement <4 x i32> %v, i32 0
+ %e1 = extractelement <4 x i32> %v, i32 1
+ %e2 = extractelement <4 x i32> %v, i32 2
+ %e3 = extractelement <4 x i32> %v, i32 3
+ %add0 = add i32 %e0, %e1
+ %add1 = add i32 %add0, %e2
+ %add2 = add i32 %add1, %e3
+ ret i32 %add2
+}
+
+
+define i32 @reduce_sum_8xi32(<8 x i32> %v) {
+; RV32-LABEL: reduce_sum_8xi32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vslidedown.vi v10, v8, 1
+; RV32-NEXT: vmv.x.s a1, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 2
+; RV32-NEXT: vmv.x.s a2, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 3
+; RV32-NEXT: vmv.x.s a3, v10
+; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v10, v8, 4
+; RV32-NEXT: vmv.x.s a4, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 5
+; RV32-NEXT: vmv.x.s a5, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 6
+; RV32-NEXT: vmv.x.s a6, v10
+; RV32-NEXT: vslidedown.vi v8, v8, 7
+; RV32-NEXT: vmv.x.s a7, v8
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a4, a4, a5
+; RV32-NEXT: add a4, a4, a6
+; RV32-NEXT: add a0, a0, a4
+; RV32-NEXT: add a0, a0, a7
+; RV32-NEXT: ret
+;
+; RV64-LABEL: reduce_sum_8xi32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vslidedown.vi v10, v8, 1
+; RV64-NEXT: vmv.x.s a1, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 2
+; RV64-NEXT: vmv.x.s a2, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 3
+; RV64-NEXT: vmv.x.s a3, v10
+; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v8, 4
+; RV64-NEXT: vmv.x.s a4, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 5
+; RV64-NEXT: vmv.x.s a5, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 6
+; RV64-NEXT: vmv.x.s a6, v10
+; RV64-NEXT: vslidedown.vi v8, v8, 7
+; RV64-NEXT: vmv.x.s a7, v8
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a4, a4, a5
+; RV64-NEXT: add a4, a4, a6
+; RV64-NEXT: add a0, a0, a4
+; RV64-NEXT: addw a0, a0, a7
+; RV64-NEXT: ret
+ %e0 = extractelement <8 x i32> %v, i32 0
+ %e1 = extractelement <8 x i32> %v, i32 1
+ %e2 = extractelement <8 x i32> %v, i32 2
+ %e3 = extractelement <8 x i32> %v, i32 3
+ %e4 = extractelement <8 x i32> %v, i32 4
+ %e5 = extractelement <8 x i32> %v, i32 5
+ %e6 = extractelement <8 x i32> %v, i32 6
+ %e7 = extractelement <8 x i32> %v, i32 7
+ %add0 = add i32 %e0, %e1
+ %add1 = add i32 %add0, %e2
+ %add2 = add i32 %add1, %e3
+ %add3 = add i32 %add2, %e4
+ %add4 = add i32 %add3, %e5
+ %add5 = add i32 %add4, %e6
+ %add6 = add i32 %add5, %e7
+ ret i32 %add6
+}
+
+define i32 @reduce_sum_16xi32(<16 x i32> %v) {
+; RV32-LABEL: reduce_sum_16xi32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -128
+; RV32-NEXT: .cfi_def_cfa_offset 128
+; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 116(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: .cfi_offset s0, -8
+; RV32-NEXT: .cfi_offset s2, -12
+; RV32-NEXT: addi s0, sp, 128
+; RV32-NEXT: .cfi_def_cfa s0, 0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vslidedown.vi v12, v8, 1
+; RV32-NEXT: vmv.x.s a1, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 2
+; RV32-NEXT: vmv.x.s a2, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 3
+; RV32-NEXT: vmv.x.s a3, v12
+; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v12, v8, 4
+; RV32-NEXT: vmv.x.s a4, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 5
+; RV32-NEXT: vmv.x.s a5, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 6
+; RV32-NEXT: vmv.x.s a6, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 7
+; RV32-NEXT: vmv.x.s a7, v12
+; RV32-NEXT: mv t0, sp
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vse32.v v8, (t0)
+; RV32-NEXT: lw t0, 32(sp)
+; RV32-NEXT: lw t1, 36(sp)
+; RV32-NEXT: lw t2, 40(sp)
+; RV32-NEXT: lw t3, 44(sp)
+; RV32-NEXT: lw t4, 48(sp)
+; RV32-NEXT: lw t5, 52(sp)
+; RV32-NEXT: lw t6, 56(sp)
+; RV32-NEXT: lw s2, 60(sp)
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a4, a4, a5
+; RV32-NEXT: add a4, a4, a6
+; RV32-NEXT: add a0, a0, a4
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: add a0, a0, a7
+; RV32-NEXT: add t1, t1, t2
+; RV32-NEXT: add t1, t1, t3
+; RV32-NEXT: add a0, a0, t1
+; RV32-NEXT: add t4, t4, t5
+; RV32-NEXT: add t4, t4, t6
+; RV32-NEXT: add t4, t4, s2
+; RV32-NEXT: add a0, a0, t4
+; RV32-NEXT: addi sp, s0, -128
+; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 116(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 128
+; RV32-NEXT: ret
+;
+; RV64-LABEL: reduce_sum_16xi32:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -128
+; RV64-NEXT: .cfi_def_cfa_offset 128
+; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s2, 104(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: .cfi_offset s2, -24
+; RV64-NEXT: addi s0, sp, 128
+; RV64-NEXT: .cfi_def_cfa s0, 0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vslidedown.vi v12, v8, 1
+; RV64-NEXT: vmv.x.s a1, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 2
+; RV64-NEXT: vmv.x.s a2, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 3
+; RV64-NEXT: vmv.x.s a3, v12
+; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v12, v8, 4
+; RV64-NEXT: vmv.x.s a4, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 5
+; RV64-NEXT: vmv.x.s a5, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 6
+; RV64-NEXT: vmv.x.s a6, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 7
+; RV64-NEXT: vmv.x.s a7, v12
+; RV64-NEXT: mv t0, sp
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vse32.v v8, (t0)
+; RV64-NEXT: lw t0, 32(sp)
+; RV64-NEXT: lw t1, 36(sp)
+; RV64-NEXT: lw t2, 40(sp)
+; RV64-NEXT: lw t3, 44(sp)
+; RV64-NEXT: lw t4, 48(sp)
+; RV64-NEXT: lw t5, 52(sp)
+; RV64-NEXT: lw t6, 56(sp)
+; RV64-NEXT: lw s2, 60(sp)
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a4, a4, a5
+; RV64-NEXT: add a4, a4, a6
+; RV64-NEXT: add a0, a0, a4
+; RV64-NEXT: add a7, a7, t0
+; RV64-NEXT: add a0, a0, a7
+; RV64-NEXT: add t1, t1, t2
+; RV64-NEXT: add t1, t1, t3
+; RV64-NEXT: add a0, a0, t1
+; RV64-NEXT: add t4, t4, t5
+; RV64-NEXT: add t4, t4, t6
+; RV64-NEXT: add t4, t4, s2
+; RV64-NEXT: addw a0, a0, t4
+; RV64-NEXT: addi sp, s0, -128
+; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s2, 104(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 128
+; RV64-NEXT: ret
+ %e0 = extractelement <16 x i32> %v, i32 0
+ %e1 = extractelement <16 x i32> %v, i32 1
+ %e2 = extractelement <16 x i32> %v, i32 2
+ %e3 = extractelement <16 x i32> %v, i32 3
+ %e4 = extractelement <16 x i32> %v, i32 4
+ %e5 = extractelement <16 x i32> %v, i32 5
+ %e6 = extractelement <16 x i32> %v, i32 6
+ %e7 = extractelement <16 x i32> %v, i32 7
+ %e8 = extractelement <16 x i32> %v, i32 8
+ %e9 = extractelement <16 x i32> %v, i32 9
+ %e10 = extractelement <16 x i32> %v, i32 10
+ %e11 = extractelement <16 x i32> %v, i32 11
+ %e12 = extractelement <16 x i32> %v, i32 12
+ %e13 = extractelement <16 x i32> %v, i32 13
+ %e14 = extractelement <16 x i32> %v, i32 14
+ %e15 = extractelement <16 x i32> %v, i32 15
+ %add0 = add i32 %e0, %e1
+ %add1 = add i32 %add0, %e2
+ %add2 = add i32 %add1, %e3
+ %add3 = add i32 %add2, %e4
+ %add4 = add i32 %add3, %e5
+ %add5 = add i32 %add4, %e6
+ %add6 = add i32 %add5, %e7
+ %add7 = add i32 %add6, %e8
+ %add8 = add i32 %add7, %e9
+ %add9 = add i32 %add8, %e10
+ %add10 = add i32 %add9, %e11
+ %add11 = add i32 %add10, %e12
+ %add12 = add i32 %add11, %e13
+ %add13 = add i32 %add12, %e14
+ %add14 = add i32 %add13, %e15
+ ret i32 %add14
+}
+
+define i32 @reduce_sum_16xi32_prefix2(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix2:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 1
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix2:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 1
+; RV64-NEXT: vmv.x.s a1, v8
+; RV64-NEXT: addw a0, a0, a1
+; RV64-NEXT: ret
+ %v = load <16 x i32>, ptr %p, align 256
+ %e0 = extractelement <16 x i32> %v, i32 0
+ %e1 = extractelement <16 x i32> %v, i32 1
+ %add0 = add i32 %e0, %e1
+ ret i32 %add0
+}
+
+define i32 @reduce_sum_16xi32_prefix3(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix3:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 1
+; RV32-NEXT: vmv.x.s a1, v9
+; RV32-NEXT: vslidedown.vi v8, v8, 2
+; RV32-NEXT: vmv.x.s a2, v8
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix3:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 1
+; RV64-NEXT: vmv.x.s a1, v9
+; RV64-NEXT: vslidedown.vi v8, v8, 2
+; RV64-NEXT: vmv.x.s a2, v8
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: addw a0, a0, a2
+; RV64-NEXT: ret
+ %v = load <16 x i32>, ptr %p, align 256
+ %e0 = extractelement <16 x i32> %v, i32 0
+ %e1 = extractelement <16 x i32> %v, i32 1
+ %e2 = extractelement <16 x i32> %v, i32 2
+ %add0 = add i32 %e0, %e1
+ %add1 = add i32 %add0, %e2
+ ret i32 %add1
+}
+
+define i32 @reduce_sum_16xi32_prefix4(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix4:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 1
+; RV32-NEXT: vmv.x.s a1, v9
+; RV32-NEXT: vslidedown.vi v9, v8, 2
+; RV32-NEXT: vmv.x.s a2, v9
+; RV32-NEXT: vslidedown.vi v8, v8, 3
+; RV32-NEXT: vmv.x.s a3, v8
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix4:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 1
+; RV64-NEXT: vmv.x.s a1, v9
+; RV64-NEXT: vslidedown.vi v9, v8, 2
+; RV64-NEXT: vmv.x.s a2, v9
+; RV64-NEXT: vslidedown.vi v8, v8, 3
+; RV64-NEXT: vmv.x.s a3, v8
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: addw a0, a0, a2
+; RV64-NEXT: ret
+ %v = load <16 x i32>, ptr %p, align 256
+ %e0 = extractelement <16 x i32> %v, i32 0
+ %e1 = extractelement <16 x i32> %v, i32 1
+ %e2 = extractelement <16 x i32> %v, i32 2
+ %e3 = extractelement <16 x i32> %v, i32 3
+ %add0 = add i32 %e0, %e1
+ %add1 = add i32 %add0, %e2
+ %add2 = add i32 %add1, %e3
+ ret i32 %add2
+}
+
+define i32 @reduce_sum_16xi32_prefix5(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix5:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v10, v8, 1
+; RV32-NEXT: vmv.x.s a1, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 2
+; RV32-NEXT: vmv.x.s a2, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 3
+; RV32-NEXT: vmv.x.s a3, v10
+; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 4
+; RV32-NEXT: vmv.x.s a4, v8
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a0, a0, a4
+; RV32-NEXT: ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix5:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v8, 1
+; RV64-NEXT: vmv.x.s a1, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 2
+; RV64-NEXT: vmv.x.s a2, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 3
+; RV64-NEXT: vmv.x.s a3, v10
+; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 4
+; RV64-NEXT: vmv.x.s a4, v8
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: addw a0, a0, a4
+; RV64-NEXT: ret
+ %v = load <16 x i32>, ptr %p, align 256
+ %e0 = extractelement <16 x i32> %v, i32 0
+ %e1 = extractelement <16 x i32> %v, i32 1
+ %e2 = extractelement <16 x i32> %v, i32 2
+ %e3 = extractelement <16 x i32> %v, i32 3
+ %e4 = extractelement <16 x i32> %v, i32 4
+ %add0 = add i32 %e0, %e1
+ %add1 = add i32 %add0, %e2
+ %add2 = add i32 %add1, %e3
+ %add3 = add i32 %add2, %e4
+ ret i32 %add3
+}
+
+define i32 @reduce_sum_16xi32_prefix6(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix6:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v10, v8, 1
+; RV32-NEXT: vmv.x.s a1, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 2
+; RV32-NEXT: vmv.x.s a2, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 3
+; RV32-NEXT: vmv.x.s a3, v10
+; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v10, v8, 4
+; RV32-NEXT: vmv.x.s a4, v10
+; RV32-NEXT: vslidedown.vi v8, v8, 5
+; RV32-NEXT: vmv.x.s a5, v8
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a4, a4, a5
+; RV32-NEXT: add a0, a0, a4
+; RV32-NEXT: ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix6:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v8, 1
+; RV64-NEXT: vmv.x.s a1, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 2
+; RV64-NEXT: vmv.x.s a2, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 3
+; RV64-NEXT: vmv.x.s a3, v10
+; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v8, 4
+; RV64-NEXT: vmv.x.s a4, v10
+; RV64-NEXT: vslidedown.vi v8, v8, 5
+; RV64-NEXT: vmv.x.s a5, v8
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a4, a4, a5
+; RV64-NEXT: addw a0, a0, a4
+; RV64-NEXT: ret
+ %v = load <16 x i32>, ptr %p, align 256
+ %e0 = extractelement <16 x i32> %v, i32 0
+ %e1 = extractelement <16 x i32> %v, i32 1
+ %e2 = extractelement <16 x i32> %v, i32 2
+ %e3 = extractelement <16 x i32> %v, i32 3
+ %e4 = extractelement <16 x i32> %v, i32 4
+ %e5 = extractelement <16 x i32> %v, i32 5
+ %add0 = add i32 %e0, %e1
+ %add1 = add i32 %add0, %e2
+ %add2 = add i32 %add1, %e3
+ %add3 = add i32 %add2, %e4
+ %add4 = add i32 %add3, %e5
+ ret i32 %add4
+}
+
+define i32 @reduce_sum_16xi32_prefix7(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix7:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v10, v8, 1
+; RV32-NEXT: vmv.x.s a1, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 2
+; RV32-NEXT: vmv.x.s a2, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 3
+; RV32-NEXT: vmv.x.s a3, v10
+; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v10, v8, 4
+; RV32-NEXT: vmv.x.s a4, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 5
+; RV32-NEXT: vmv.x.s a5, v10
+; RV32-NEXT: vslidedown.vi v8, v8, 6
+; RV32-NEXT: vmv.x.s a6, v8
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a4, a4, a5
+; RV32-NEXT: add a4, a4, a6
+; RV32-NEXT: add a0, a0, a4
+; RV32-NEXT: ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix7:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v8, 1
+; RV64-NEXT: vmv.x.s a1, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 2
+; RV64-NEXT: vmv.x.s a2, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 3
+; RV64-NEXT: vmv.x.s a3, v10
+; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v8, 4
+; RV64-NEXT: vmv.x.s a4, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 5
+; RV64-NEXT: vmv.x.s a5, v10
+; RV64-NEXT: vslidedown.vi v8, v8, 6
+; RV64-NEXT: vmv.x.s a6, v8
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a4, a4, a5
+; RV64-NEXT: add a4, a4, a6
+; RV64-NEXT: addw a0, a0, a4
+; RV64-NEXT: ret
+ %v = load <16 x i32>, ptr %p, align 256
+ %e0 = extractelement <16 x i32> %v, i32 0
+ %e1 = extractelement <16 x i32> %v, i32 1
+ %e2 = extractelement <16 x i32> %v, i32 2
+ %e3 = extractelement <16 x i32> %v, i32 3
+ %e4 = extractelement <16 x i32> %v, i32 4
+ %e5 = extractelement <16 x i32> %v, i32 5
+ %e6 = extractelement <16 x i32> %v, i32 6
+ %add0 = add i32 %e0, %e1
+ %add1 = add i32 %add0, %e2
+ %add2 = add i32 %add1, %e3
+ %add3 = add i32 %add2, %e4
+ %add4 = add i32 %add3, %e5
+ %add5 = add i32 %add4, %e6
+ ret i32 %add5
+}
+
+define i32 @reduce_sum_16xi32_prefix8(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v10, v8, 1
+; RV32-NEXT: vmv.x.s a1, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 2
+; RV32-NEXT: vmv.x.s a2, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 3
+; RV32-NEXT: vmv.x.s a3, v10
+; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v10, v8, 4
+; RV32-NEXT: vmv.x.s a4, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 5
+; RV32-NEXT: vmv.x.s a5, v10
+; RV32-NEXT: vslidedown.vi v10, v8, 6
+; RV32-NEXT: vmv.x.s a6, v10
+; RV32-NEXT: vslidedown.vi v8, v8, 7
+; RV32-NEXT: vmv.x.s a7, v8
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a4, a4, a5
+; RV32-NEXT: add a4, a4, a6
+; RV32-NEXT: add a0, a0, a4
+; RV32-NEXT: add a0, a0, a7
+; RV32-NEXT: ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v8, 1
+; RV64-NEXT: vmv.x.s a1, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 2
+; RV64-NEXT: vmv.x.s a2, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 3
+; RV64-NEXT: vmv.x.s a3, v10
+; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v8, 4
+; RV64-NEXT: vmv.x.s a4, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 5
+; RV64-NEXT: vmv.x.s a5, v10
+; RV64-NEXT: vslidedown.vi v10, v8, 6
+; RV64-NEXT: vmv.x.s a6, v10
+; RV64-NEXT: vslidedown.vi v8, v8, 7
+; RV64-NEXT: vmv.x.s a7, v8
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a4, a4, a5
+; RV64-NEXT: add a4, a4, a6
+; RV64-NEXT: add a0, a0, a4
+; RV64-NEXT: addw a0, a0, a7
+; RV64-NEXT: ret
+ %v = load <16 x i32>, ptr %p, align 256
+ %e0 = extractelement <16 x i32> %v, i32 0
+ %e1 = extractelement <16 x i32> %v, i32 1
+ %e2 = extractelement <16 x i32> %v, i32 2
+ %e3 = extractelement <16 x i32> %v, i32 3
+ %e4 = extractelement <16 x i32> %v, i32 4
+ %e5 = extractelement <16 x i32> %v, i32 5
+ %e6 = extractelement <16 x i32> %v, i32 6
+ %e7 = extractelement <16 x i32> %v, i32 7
+ %add0 = add i32 %e0, %e1
+ %add1 = add i32 %add0, %e2
+ %add2 = add i32 %add1, %e3
+ %add3 = add i32 %add2, %e4
+ %add4 = add i32 %add3, %e5
+ %add5 = add i32 %add4, %e6
+ %add6 = add i32 %add5, %e7
+ ret i32 %add6
+}
+
+define i32 @reduce_sum_16xi32_prefix9(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix9:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -128
+; RV32-NEXT: .cfi_def_cfa_offset 128
+; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: .cfi_offset s0, -8
+; RV32-NEXT: addi s0, sp, 128
+; RV32-NEXT: .cfi_def_cfa s0, 0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v12, v8, 1
+; RV32-NEXT: vmv.x.s a1, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 2
+; RV32-NEXT: vmv.x.s a2, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 3
+; RV32-NEXT: vmv.x.s a3, v12
+; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v12, v8, 4
+; RV32-NEXT: vmv.x.s a4, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 5
+; RV32-NEXT: vmv.x.s a5, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 6
+; RV32-NEXT: vmv.x.s a6, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 7
+; RV32-NEXT: vmv.x.s a7, v12
+; RV32-NEXT: mv t0, sp
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vse32.v v8, (t0)
+; RV32-NEXT: lw t0, 32(sp)
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a4, a4, a5
+; RV32-NEXT: add a4, a4, a6
+; RV32-NEXT: add a0, a0, a4
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: add a0, a0, a7
+; RV32-NEXT: addi sp, s0, -128
+; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 128
+; RV32-NEXT: ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix9:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -128
+; RV64-NEXT: .cfi_def_cfa_offset 128
+; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: addi s0, sp, 128
+; RV64-NEXT: .cfi_def_cfa s0, 0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v12, v8, 1
+; RV64-NEXT: vmv.x.s a1, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 2
+; RV64-NEXT: vmv.x.s a2, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 3
+; RV64-NEXT: vmv.x.s a3, v12
+; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v12, v8, 4
+; RV64-NEXT: vmv.x.s a4, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 5
+; RV64-NEXT: vmv.x.s a5, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 6
+; RV64-NEXT: vmv.x.s a6, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 7
+; RV64-NEXT: vmv.x.s a7, v12
+; RV64-NEXT: mv t0, sp
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vse32.v v8, (t0)
+; RV64-NEXT: lw t0, 32(sp)
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a4, a4, a5
+; RV64-NEXT: add a4, a4, a6
+; RV64-NEXT: add a0, a0, a4
+; RV64-NEXT: add a7, a7, t0
+; RV64-NEXT: addw a0, a0, a7
+; RV64-NEXT: addi sp, s0, -128
+; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 128
+; RV64-NEXT: ret
+ %v = load <16 x i32>, ptr %p, align 256
+ %e0 = extractelement <16 x i32> %v, i32 0
+ %e1 = extractelement <16 x i32> %v, i32 1
+ %e2 = extractelement <16 x i32> %v, i32 2
+ %e3 = extractelement <16 x i32> %v, i32 3
+ %e4 = extractelement <16 x i32> %v, i32 4
+ %e5 = extractelement <16 x i32> %v, i32 5
+ %e6 = extractelement <16 x i32> %v, i32 6
+ %e7 = extractelement <16 x i32> %v, i32 7
+ %e8 = extractelement <16 x i32> %v, i32 8
+ %add0 = add i32 %e0, %e1
+ %add1 = add i32 %add0, %e2
+ %add2 = add i32 %add1, %e3
+ %add3 = add i32 %add2, %e4
+ %add4 = add i32 %add3, %e5
+ %add5 = add i32 %add4, %e6
+ %add6 = add i32 %add5, %e7
+ %add7 = add i32 %add6, %e8
+ ret i32 %add7
+}
+
+define i32 @reduce_sum_16xi32_prefix13(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix13:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -128
+; RV32-NEXT: .cfi_def_cfa_offset 128
+; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: .cfi_offset s0, -8
+; RV32-NEXT: addi s0, sp, 128
+; RV32-NEXT: .cfi_def_cfa s0, 0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v12, v8, 1
+; RV32-NEXT: vmv.x.s a1, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 2
+; RV32-NEXT: vmv.x.s a2, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 3
+; RV32-NEXT: vmv.x.s a3, v12
+; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v12, v8, 4
+; RV32-NEXT: vmv.x.s a4, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 5
+; RV32-NEXT: vmv.x.s a5, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 6
+; RV32-NEXT: vmv.x.s a6, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 7
+; RV32-NEXT: vmv.x.s a7, v12
+; RV32-NEXT: mv t0, sp
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vse32.v v8, (t0)
+; RV32-NEXT: lw t0, 32(sp)
+; RV32-NEXT: lw t1, 36(sp)
+; RV32-NEXT: lw t2, 40(sp)
+; RV32-NEXT: lw t3, 44(sp)
+; RV32-NEXT: lw t4, 48(sp)
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a4, a4, a5
+; RV32-NEXT: add a4, a4, a6
+; RV32-NEXT: add a0, a0, a4
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: add a7, a7, t1
+; RV32-NEXT: add a7, a7, t2
+; RV32-NEXT: add a0, a0, a7
+; RV32-NEXT: add t3, t3, t4
+; RV32-NEXT: add a0, a0, t3
+; RV32-NEXT: addi sp, s0, -128
+; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 128
+; RV32-NEXT: ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix13:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -128
+; RV64-NEXT: .cfi_def_cfa_offset 128
+; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: addi s0, sp, 128
+; RV64-NEXT: .cfi_def_cfa s0, 0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v12, v8, 1
+; RV64-NEXT: vmv.x.s a1, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 2
+; RV64-NEXT: vmv.x.s a2, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 3
+; RV64-NEXT: vmv.x.s a3, v12
+; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v12, v8, 4
+; RV64-NEXT: vmv.x.s a4, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 5
+; RV64-NEXT: vmv.x.s a5, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 6
+; RV64-NEXT: vmv.x.s a6, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 7
+; RV64-NEXT: vmv.x.s a7, v12
+; RV64-NEXT: mv t0, sp
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vse32.v v8, (t0)
+; RV64-NEXT: lw t0, 32(sp)
+; RV64-NEXT: lw t1, 36(sp)
+; RV64-NEXT: lw t2, 40(sp)
+; RV64-NEXT: lw t3, 44(sp)
+; RV64-NEXT: lw t4, 48(sp)
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a4, a4, a5
+; RV64-NEXT: add a4, a4, a6
+; RV64-NEXT: add a0, a0, a4
+; RV64-NEXT: add a7, a7, t0
+; RV64-NEXT: add a7, a7, t1
+; RV64-NEXT: add a7, a7, t2
+; RV64-NEXT: add a0, a0, a7
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: addw a0, a0, t3
+; RV64-NEXT: addi sp, s0, -128
+; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 128
+; RV64-NEXT: ret
+ %v = load <16 x i32>, ptr %p, align 256
+ %e0 = extractelement <16 x i32> %v, i32 0
+ %e1 = extractelement <16 x i32> %v, i32 1
+ %e2 = extractelement <16 x i32> %v, i32 2
+ %e3 = extractelement <16 x i32> %v, i32 3
+ %e4 = extractelement <16 x i32> %v, i32 4
+ %e5 = extractelement <16 x i32> %v, i32 5
+ %e6 = extractelement <16 x i32> %v, i32 6
+ %e7 = extractelement <16 x i32> %v, i32 7
+ %e8 = extractelement <16 x i32> %v, i32 8
+ %e9 = extractelement <16 x i32> %v, i32 9
+ %e10 = extractelement <16 x i32> %v, i32 10
+ %e11 = extractelement <16 x i32> %v, i32 11
+ %e12 = extractelement <16 x i32> %v, i32 12
+ %add0 = add i32 %e0, %e1
+ %add1 = add i32 %add0, %e2
+ %add2 = add i32 %add1, %e3
+ %add3 = add i32 %add2, %e4
+ %add4 = add i32 %add3, %e5
+ %add5 = add i32 %add4, %e6
+ %add6 = add i32 %add5, %e7
+ %add7 = add i32 %add6, %e8
+ %add8 = add i32 %add7, %e9
+ %add9 = add i32 %add8, %e10
+ %add10 = add i32 %add9, %e11
+ %add11 = add i32 %add10, %e12
+ ret i32 %add11
+}
+
+
+define i32 @reduce_sum_16xi32_prefix14(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix14:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -128
+; RV32-NEXT: .cfi_def_cfa_offset 128
+; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: .cfi_offset s0, -8
+; RV32-NEXT: addi s0, sp, 128
+; RV32-NEXT: .cfi_def_cfa s0, 0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v12, v8, 1
+; RV32-NEXT: vmv.x.s a1, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 2
+; RV32-NEXT: vmv.x.s a2, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 3
+; RV32-NEXT: vmv.x.s a3, v12
+; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v12, v8, 4
+; RV32-NEXT: vmv.x.s a4, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 5
+; RV32-NEXT: vmv.x.s a5, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 6
+; RV32-NEXT: vmv.x.s a6, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 7
+; RV32-NEXT: vmv.x.s a7, v12
+; RV32-NEXT: mv t0, sp
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vse32.v v8, (t0)
+; RV32-NEXT: lw t0, 32(sp)
+; RV32-NEXT: lw t1, 36(sp)
+; RV32-NEXT: lw t2, 40(sp)
+; RV32-NEXT: lw t3, 44(sp)
+; RV32-NEXT: lw t4, 48(sp)
+; RV32-NEXT: lw t5, 52(sp)
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a4, a4, a5
+; RV32-NEXT: add a4, a4, a6
+; RV32-NEXT: add a0, a0, a4
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: add a7, a7, t1
+; RV32-NEXT: add a7, a7, t2
+; RV32-NEXT: add a0, a0, a7
+; RV32-NEXT: add t3, t3, t4
+; RV32-NEXT: add t3, t3, t5
+; RV32-NEXT: add a0, a0, t3
+; RV32-NEXT: addi sp, s0, -128
+; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 128
+; RV32-NEXT: ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix14:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -128
+; RV64-NEXT: .cfi_def_cfa_offset 128
+; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: addi s0, sp, 128
+; RV64-NEXT: .cfi_def_cfa s0, 0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v12, v8, 1
+; RV64-NEXT: vmv.x.s a1, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 2
+; RV64-NEXT: vmv.x.s a2, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 3
+; RV64-NEXT: vmv.x.s a3, v12
+; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v12, v8, 4
+; RV64-NEXT: vmv.x.s a4, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 5
+; RV64-NEXT: vmv.x.s a5, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 6
+; RV64-NEXT: vmv.x.s a6, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 7
+; RV64-NEXT: vmv.x.s a7, v12
+; RV64-NEXT: mv t0, sp
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vse32.v v8, (t0)
+; RV64-NEXT: lw t0, 32(sp)
+; RV64-NEXT: lw t1, 36(sp)
+; RV64-NEXT: lw t2, 40(sp)
+; RV64-NEXT: lw t3, 44(sp)
+; RV64-NEXT: lw t4, 48(sp)
+; RV64-NEXT: lw t5, 52(sp)
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a4, a4, a5
+; RV64-NEXT: add a4, a4, a6
+; RV64-NEXT: add a0, a0, a4
+; RV64-NEXT: add a7, a7, t0
+; RV64-NEXT: add a7, a7, t1
+; RV64-NEXT: add a7, a7, t2
+; RV64-NEXT: add a0, a0, a7
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: add t3, t3, t5
+; RV64-NEXT: addw a0, a0, t3
+; RV64-NEXT: addi sp, s0, -128
+; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 128
+; RV64-NEXT: ret
+ %v = load <16 x i32>, ptr %p, align 256
+ %e0 = extractelement <16 x i32> %v, i32 0
+ %e1 = extractelement <16 x i32> %v, i32 1
+ %e2 = extractelement <16 x i32> %v, i32 2
+ %e3 = extractelement <16 x i32> %v, i32 3
+ %e4 = extractelement <16 x i32> %v, i32 4
+ %e5 = extractelement <16 x i32> %v, i32 5
+ %e6 = extractelement <16 x i32> %v, i32 6
+ %e7 = extractelement <16 x i32> %v, i32 7
+ %e8 = extractelement <16 x i32> %v, i32 8
+ %e9 = extractelement <16 x i32> %v, i32 9
+ %e10 = extractelement <16 x i32> %v, i32 10
+ %e11 = extractelement <16 x i32> %v, i32 11
+ %e12 = extractelement <16 x i32> %v, i32 12
+ %e13 = extractelement <16 x i32> %v, i32 13
+ %add0 = add i32 %e0, %e1
+ %add1 = add i32 %add0, %e2
+ %add2 = add i32 %add1, %e3
+ %add3 = add i32 %add2, %e4
+ %add4 = add i32 %add3, %e5
+ %add5 = add i32 %add4, %e6
+ %add6 = add i32 %add5, %e7
+ %add7 = add i32 %add6, %e8
+ %add8 = add i32 %add7, %e9
+ %add9 = add i32 %add8, %e10
+ %add10 = add i32 %add9, %e11
+ %add11 = add i32 %add10, %e12
+ %add12 = add i32 %add11, %e13
+ ret i32 %add12
+}
+
+define i32 @reduce_sum_16xi32_prefix15(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix15:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -128
+; RV32-NEXT: .cfi_def_cfa_offset 128
+; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: .cfi_offset s0, -8
+; RV32-NEXT: addi s0, sp, 128
+; RV32-NEXT: .cfi_def_cfa s0, 0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v12, v8, 1
+; RV32-NEXT: vmv.x.s a1, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 2
+; RV32-NEXT: vmv.x.s a2, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 3
+; RV32-NEXT: vmv.x.s a3, v12
+; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v12, v8, 4
+; RV32-NEXT: vmv.x.s a4, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 5
+; RV32-NEXT: vmv.x.s a5, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 6
+; RV32-NEXT: vmv.x.s a6, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 7
+; RV32-NEXT: vmv.x.s a7, v12
+; RV32-NEXT: mv t0, sp
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vse32.v v8, (t0)
+; RV32-NEXT: lw t0, 32(sp)
+; RV32-NEXT: lw t1, 36(sp)
+; RV32-NEXT: lw t2, 40(sp)
+; RV32-NEXT: lw t3, 44(sp)
+; RV32-NEXT: lw t4, 48(sp)
+; RV32-NEXT: lw t5, 52(sp)
+; RV32-NEXT: lw t6, 56(sp)
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a4, a4, a5
+; RV32-NEXT: add a4, a4, a6
+; RV32-NEXT: add a0, a0, a4
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: add a7, a7, t1
+; RV32-NEXT: add a7, a7, t2
+; RV32-NEXT: add a0, a0, a7
+; RV32-NEXT: add t3, t3, t4
+; RV32-NEXT: add t3, t3, t5
+; RV32-NEXT: add t3, t3, t6
+; RV32-NEXT: add a0, a0, t3
+; RV32-NEXT: addi sp, s0, -128
+; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 128
+; RV32-NEXT: ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix15:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -128
+; RV64-NEXT: .cfi_def_cfa_offset 128
+; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: addi s0, sp, 128
+; RV64-NEXT: .cfi_def_cfa s0, 0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v12, v8, 1
+; RV64-NEXT: vmv.x.s a1, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 2
+; RV64-NEXT: vmv.x.s a2, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 3
+; RV64-NEXT: vmv.x.s a3, v12
+; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v12, v8, 4
+; RV64-NEXT: vmv.x.s a4, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 5
+; RV64-NEXT: vmv.x.s a5, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 6
+; RV64-NEXT: vmv.x.s a6, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 7
+; RV64-NEXT: vmv.x.s a7, v12
+; RV64-NEXT: mv t0, sp
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vse32.v v8, (t0)
+; RV64-NEXT: lw t0, 32(sp)
+; RV64-NEXT: lw t1, 36(sp)
+; RV64-NEXT: lw t2, 40(sp)
+; RV64-NEXT: lw t3, 44(sp)
+; RV64-NEXT: lw t4, 48(sp)
+; RV64-NEXT: lw t5, 52(sp)
+; RV64-NEXT: lw t6, 56(sp)
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a4, a4, a5
+; RV64-NEXT: add a4, a4, a6
+; RV64-NEXT: add a0, a0, a4
+; RV64-NEXT: add a7, a7, t0
+; RV64-NEXT: add a7, a7, t1
+; RV64-NEXT: add a7, a7, t2
+; RV64-NEXT: add a0, a0, a7
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: add t3, t3, t5
+; RV64-NEXT: add t3, t3, t6
+; RV64-NEXT: addw a0, a0, t3
+; RV64-NEXT: addi sp, s0, -128
+; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 128
+; RV64-NEXT: ret
+ %v = load <16 x i32>, ptr %p, align 256
+ %e0 = extractelement <16 x i32> %v, i32 0
+ %e1 = extractelement <16 x i32> %v, i32 1
+ %e2 = extractelement <16 x i32> %v, i32 2
+ %e3 = extractelement <16 x i32> %v, i32 3
+ %e4 = extractelement <16 x i32> %v, i32 4
+ %e5 = extractelement <16 x i32> %v, i32 5
+ %e6 = extractelement <16 x i32> %v, i32 6
+ %e7 = extractelement <16 x i32> %v, i32 7
+ %e8 = extractelement <16 x i32> %v, i32 8
+ %e9 = extractelement <16 x i32> %v, i32 9
+ %e10 = extractelement <16 x i32> %v, i32 10
+ %e11 = extractelement <16 x i32> %v, i32 11
+ %e12 = extractelement <16 x i32> %v, i32 12
+ %e13 = extractelement <16 x i32> %v, i32 13
+ %e14 = extractelement <16 x i32> %v, i32 14
+ %add0 = add i32 %e0, %e1
+ %add1 = add i32 %add0, %e2
+ %add2 = add i32 %add1, %e3
+ %add3 = add i32 %add2, %e4
+ %add4 = add i32 %add3, %e5
+ %add5 = add i32 %add4, %e6
+ %add6 = add i32 %add5, %e7
+ %add7 = add i32 %add6, %e8
+ %add8 = add i32 %add7, %e9
+ %add9 = add i32 %add8, %e10
+ %add10 = add i32 %add9, %e11
+ %add11 = add i32 %add10, %e12
+ %add12 = add i32 %add11, %e13
+ %add13 = add i32 %add12, %e14
+ ret i32 %add13
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}