[llvm] cd03d97 - [RISCV] Add test coverage for sum reduction recognition in DAG

Philip Reames via llvm-commits llvm-commits@lists.llvm.org
Fri Sep 29 07:55:09 PDT 2023


Author: Philip Reames
Date: 2023-09-29T07:54:55-07:00
New Revision: cd03d970431e9aa5d7df1bbef85353c4f2a21d40

URL: https://github.com/llvm/llvm-project/commit/cd03d970431e9aa5d7df1bbef85353c4f2a21d40
DIFF: https://github.com/llvm/llvm-project/commit/cd03d970431e9aa5d7df1bbef85353c4f2a21d40.diff

LOG: [RISCV] Add test coverage for sum reduction recognition in DAG

Also adjust an existing test so that it is no longer a simple reduction, preserving its original intent.
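
For context, the new file exercises linear extract-and-add chains of the form sketched below (this example mirrors reduce_sum_4xi32 from the added test). A chain like this is a candidate for being recognized in the DAG as a single sum reduction (e.g. lowered to a vector reduction instruction), whereas the xor swapped into the explodevector tests breaks the pure add chain, so those tests continue to cover per-element extraction rather than any future reduction combine.

  define i32 @reduce_sum_4xi32(<4 x i32> %v) {
    ; Extract each lane and accumulate with scalar adds; a DAG combine may
    ; recognize this whole chain as a sum reduction of %v.
    %e0 = extractelement <4 x i32> %v, i32 0
    %e1 = extractelement <4 x i32> %v, i32 1
    %e2 = extractelement <4 x i32> %v, i32 2
    %e3 = extractelement <4 x i32> %v, i32 3
    %add0 = add i32 %e0, %e1
    %add1 = add i32 %add0, %e2
    %add2 = add i32 %add1, %e3
    ret i32 %add2
  }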

Added: 
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll

Modified: 
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
index 84ef9283802b901..d22505eac047886 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
@@ -29,15 +29,15 @@ define i8 @explode_4xi8(<4 x i8> %v) {
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 3
 ; CHECK-NEXT:    vmv.x.s a3, v8
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    xor a0, a0, a2
+; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:    ret
   %e0 = extractelement <4 x i8> %v, i32 0
   %e1 = extractelement <4 x i8> %v, i32 1
   %e2 = extractelement <4 x i8> %v, i32 2
   %e3 = extractelement <4 x i8> %v, i32 3
   %add0 = add i8 %e0, %e1
-  %add1 = add i8 %add0, %e2
+  %add1 = xor i8 %add0, %e2
   %add2 = add i8 %add1, %e3
   ret i8 %add2
 }
@@ -63,12 +63,12 @@ define i8 @explode_8xi8(<8 x i8> %v) {
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 7
 ; CHECK-NEXT:    vmv.x.s a7, v8
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    add a4, a4, a5
-; CHECK-NEXT:    add a4, a4, a6
-; CHECK-NEXT:    add a0, a0, a4
-; CHECK-NEXT:    add a0, a0, a7
+; CHECK-NEXT:    xor a0, a0, a2
+; CHECK-NEXT:    add a3, a3, a4
+; CHECK-NEXT:    add a3, a3, a5
+; CHECK-NEXT:    add a0, a0, a3
+; CHECK-NEXT:    add a6, a6, a7
+; CHECK-NEXT:    add a0, a0, a6
 ; CHECK-NEXT:    ret
   %e0 = extractelement <8 x i8> %v, i32 0
   %e1 = extractelement <8 x i8> %v, i32 1
@@ -79,7 +79,7 @@ define i8 @explode_8xi8(<8 x i8> %v) {
   %e6 = extractelement <8 x i8> %v, i32 6
   %e7 = extractelement <8 x i8> %v, i32 7
   %add0 = add i8 %e0, %e1
-  %add1 = add i8 %add0, %e2
+  %add1 = xor i8 %add0, %e2
   %add2 = add i8 %add1, %e3
   %add3 = add i8 %add2, %e4
   %add4 = add i8 %add3, %e5
@@ -128,20 +128,20 @@ define i8 @explode_16xi8(<16 x i8> %v) {
 ; RV32-NEXT:    vslidedown.vi v8, v8, 15
 ; RV32-NEXT:    vmv.x.s s0, v8
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, a4, a6
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    add a7, a7, t0
-; RV32-NEXT:    add a7, a7, t1
-; RV32-NEXT:    add a7, a7, t2
-; RV32-NEXT:    add a0, a0, a7
-; RV32-NEXT:    add t3, t3, t4
-; RV32-NEXT:    add t3, t3, t5
-; RV32-NEXT:    add t3, t3, t6
-; RV32-NEXT:    add t3, t3, s0
-; RV32-NEXT:    add a0, a0, t3
+; RV32-NEXT:    xor a0, a0, a2
+; RV32-NEXT:    add a3, a3, a4
+; RV32-NEXT:    add a3, a3, a5
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, a6, t0
+; RV32-NEXT:    add a6, a6, t1
+; RV32-NEXT:    add a0, a0, a6
+; RV32-NEXT:    add t2, t2, t3
+; RV32-NEXT:    add t2, t2, t4
+; RV32-NEXT:    add t2, t2, t5
+; RV32-NEXT:    add t2, t2, t6
+; RV32-NEXT:    add a0, a0, t2
+; RV32-NEXT:    add a0, a0, s0
 ; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
@@ -185,20 +185,20 @@ define i8 @explode_16xi8(<16 x i8> %v) {
 ; RV64-NEXT:    vslidedown.vi v8, v8, 15
 ; RV64-NEXT:    vmv.x.s s0, v8
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, a4, a6
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a7, a7, t0
-; RV64-NEXT:    add a7, a7, t1
-; RV64-NEXT:    add a7, a7, t2
-; RV64-NEXT:    add a0, a0, a7
-; RV64-NEXT:    add t3, t3, t4
-; RV64-NEXT:    add t3, t3, t5
-; RV64-NEXT:    add t3, t3, t6
-; RV64-NEXT:    add t3, t3, s0
-; RV64-NEXT:    add a0, a0, t3
+; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, a6, t0
+; RV64-NEXT:    add a6, a6, t1
+; RV64-NEXT:    add a0, a0, a6
+; RV64-NEXT:    add t2, t2, t3
+; RV64-NEXT:    add t2, t2, t4
+; RV64-NEXT:    add t2, t2, t5
+; RV64-NEXT:    add t2, t2, t6
+; RV64-NEXT:    add a0, a0, t2
+; RV64-NEXT:    add a0, a0, s0
 ; RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
@@ -219,7 +219,7 @@ define i8 @explode_16xi8(<16 x i8> %v) {
   %e14 = extractelement <16 x i8> %v, i32 14
   %e15 = extractelement <16 x i8> %v, i32 15
   %add0 = add i8 %e0, %e1
-  %add1 = add i8 %add0, %e2
+  %add1 = xor i8 %add0, %e2
   %add2 = add i8 %add1, %e3
   %add3 = add i8 %add2, %e4
   %add4 = add i8 %add3, %e5
@@ -263,15 +263,15 @@ define i16 @explode_4xi16(<4 x i16> %v) {
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 3
 ; CHECK-NEXT:    vmv.x.s a3, v8
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    xor a0, a0, a2
+; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:    ret
   %e0 = extractelement <4 x i16> %v, i32 0
   %e1 = extractelement <4 x i16> %v, i32 1
   %e2 = extractelement <4 x i16> %v, i32 2
   %e3 = extractelement <4 x i16> %v, i32 3
   %add0 = add i16 %e0, %e1
-  %add1 = add i16 %add0, %e2
+  %add1 = xor i16 %add0, %e2
   %add2 = add i16 %add1, %e3
   ret i16 %add2
 }
@@ -297,12 +297,12 @@ define i16 @explode_8xi16(<8 x i16> %v) {
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 7
 ; CHECK-NEXT:    vmv.x.s a7, v8
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    add a4, a4, a5
-; CHECK-NEXT:    add a4, a4, a6
-; CHECK-NEXT:    add a0, a0, a4
-; CHECK-NEXT:    add a0, a0, a7
+; CHECK-NEXT:    xor a0, a0, a2
+; CHECK-NEXT:    add a3, a3, a4
+; CHECK-NEXT:    add a3, a3, a5
+; CHECK-NEXT:    add a0, a0, a3
+; CHECK-NEXT:    add a6, a6, a7
+; CHECK-NEXT:    add a0, a0, a6
 ; CHECK-NEXT:    ret
   %e0 = extractelement <8 x i16> %v, i32 0
   %e1 = extractelement <8 x i16> %v, i32 1
@@ -313,7 +313,7 @@ define i16 @explode_8xi16(<8 x i16> %v) {
   %e6 = extractelement <8 x i16> %v, i32 6
   %e7 = extractelement <8 x i16> %v, i32 7
   %add0 = add i16 %e0, %e1
-  %add1 = add i16 %add0, %e2
+  %add1 = xor i16 %add0, %e2
   %add2 = add i16 %add1, %e3
   %add3 = add i16 %add2, %e4
   %add4 = add i16 %add3, %e5
@@ -363,20 +363,20 @@ define i16 @explode_16xi16(<16 x i16> %v) {
 ; RV32-NEXT:    vslidedown.vi v8, v8, 15
 ; RV32-NEXT:    vmv.x.s s0, v8
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, a4, a6
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    add a7, a7, t0
-; RV32-NEXT:    add a7, a7, t1
-; RV32-NEXT:    add a7, a7, t2
-; RV32-NEXT:    add a0, a0, a7
-; RV32-NEXT:    add t3, t3, t4
-; RV32-NEXT:    add t3, t3, t5
-; RV32-NEXT:    add t3, t3, t6
-; RV32-NEXT:    add t3, t3, s0
-; RV32-NEXT:    add a0, a0, t3
+; RV32-NEXT:    xor a0, a0, a2
+; RV32-NEXT:    add a3, a3, a4
+; RV32-NEXT:    add a3, a3, a5
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, a6, t0
+; RV32-NEXT:    add a6, a6, t1
+; RV32-NEXT:    add a0, a0, a6
+; RV32-NEXT:    add t2, t2, t3
+; RV32-NEXT:    add t2, t2, t4
+; RV32-NEXT:    add t2, t2, t5
+; RV32-NEXT:    add t2, t2, t6
+; RV32-NEXT:    add a0, a0, t2
+; RV32-NEXT:    add a0, a0, s0
 ; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
@@ -421,20 +421,20 @@ define i16 @explode_16xi16(<16 x i16> %v) {
 ; RV64-NEXT:    vslidedown.vi v8, v8, 15
 ; RV64-NEXT:    vmv.x.s s0, v8
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, a4, a6
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a7, a7, t0
-; RV64-NEXT:    add a7, a7, t1
-; RV64-NEXT:    add a7, a7, t2
-; RV64-NEXT:    add a0, a0, a7
-; RV64-NEXT:    add t3, t3, t4
-; RV64-NEXT:    add t3, t3, t5
-; RV64-NEXT:    add t3, t3, t6
-; RV64-NEXT:    add t3, t3, s0
-; RV64-NEXT:    add a0, a0, t3
+; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, a6, t0
+; RV64-NEXT:    add a6, a6, t1
+; RV64-NEXT:    add a0, a0, a6
+; RV64-NEXT:    add t2, t2, t3
+; RV64-NEXT:    add t2, t2, t4
+; RV64-NEXT:    add t2, t2, t5
+; RV64-NEXT:    add t2, t2, t6
+; RV64-NEXT:    add a0, a0, t2
+; RV64-NEXT:    add a0, a0, s0
 ; RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
@@ -455,7 +455,7 @@ define i16 @explode_16xi16(<16 x i16> %v) {
   %e14 = extractelement <16 x i16> %v, i32 14
   %e15 = extractelement <16 x i16> %v, i32 15
   %add0 = add i16 %e0, %e1
-  %add1 = add i16 %add0, %e2
+  %add1 = xor i16 %add0, %e2
   %add2 = add i16 %add1, %e3
   %add3 = add i16 %add2, %e4
   %add4 = add i16 %add3, %e5
@@ -508,8 +508,8 @@ define i32 @explode_4xi32(<4 x i32> %v) {
 ; RV32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32-NEXT:    vmv.x.s a3, v8
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    xor a0, a0, a2
+; RV32-NEXT:    add a0, a0, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: explode_4xi32:
@@ -523,15 +523,15 @@ define i32 @explode_4xi32(<4 x i32> %v) {
 ; RV64-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-NEXT:    vmv.x.s a3, v8
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    addw a0, a0, a2
+; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    addw a0, a0, a3
 ; RV64-NEXT:    ret
   %e0 = extractelement <4 x i32> %v, i32 0
   %e1 = extractelement <4 x i32> %v, i32 1
   %e2 = extractelement <4 x i32> %v, i32 2
   %e3 = extractelement <4 x i32> %v, i32 3
   %add0 = add i32 %e0, %e1
-  %add1 = add i32 %add0, %e2
+  %add1 = xor i32 %add0, %e2
   %add2 = add i32 %add1, %e3
   ret i32 %add2
 }
@@ -558,12 +558,12 @@ define i32 @explode_8xi32(<8 x i32> %v) {
 ; RV32-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32-NEXT:    vmv.x.s a7, v8
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, a4, a6
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    xor a0, a0, a2
+; RV32-NEXT:    add a3, a3, a4
+; RV32-NEXT:    add a3, a3, a5
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a0, a0, a6
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: explode_8xi32:
@@ -586,12 +586,12 @@ define i32 @explode_8xi32(<8 x i32> %v) {
 ; RV64-NEXT:    vslidedown.vi v8, v8, 7
 ; RV64-NEXT:    vmv.x.s a7, v8
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, a4, a6
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    addw a0, a0, a7
+; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    addw a0, a0, a6
 ; RV64-NEXT:    ret
   %e0 = extractelement <8 x i32> %v, i32 0
   %e1 = extractelement <8 x i32> %v, i32 1
@@ -602,7 +602,7 @@ define i32 @explode_8xi32(<8 x i32> %v) {
   %e6 = extractelement <8 x i32> %v, i32 6
   %e7 = extractelement <8 x i32> %v, i32 7
   %add0 = add i32 %e0, %e1
-  %add1 = add i32 %add0, %e2
+  %add1 = xor i32 %add0, %e2
   %add2 = add i32 %add1, %e3
   %add3 = add i32 %add2, %e4
   %add4 = add i32 %add3, %e5
@@ -654,13 +654,13 @@ define i32 @explode_16xi32(<16 x i32> %v) {
 ; RV32-NEXT:    lw t6, 56(sp)
 ; RV32-NEXT:    lw s2, 60(sp)
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, a4, a6
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    add a7, a7, t0
-; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    xor a0, a0, a2
+; RV32-NEXT:    add a3, a3, a4
+; RV32-NEXT:    add a3, a3, a5
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a6, a6, t0
+; RV32-NEXT:    add a0, a0, a6
 ; RV32-NEXT:    add t1, t1, t2
 ; RV32-NEXT:    add t1, t1, t3
 ; RV32-NEXT:    add a0, a0, t1
@@ -717,13 +717,13 @@ define i32 @explode_16xi32(<16 x i32> %v) {
 ; RV64-NEXT:    lw t6, 56(sp)
 ; RV64-NEXT:    lw s2, 60(sp)
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, a4, a6
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a7, a7, t0
-; RV64-NEXT:    add a0, a0, a7
+; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, a6, t0
+; RV64-NEXT:    add a0, a0, a6
 ; RV64-NEXT:    add t1, t1, t2
 ; RV64-NEXT:    add t1, t1, t3
 ; RV64-NEXT:    add a0, a0, t1
@@ -754,7 +754,7 @@ define i32 @explode_16xi32(<16 x i32> %v) {
   %e14 = extractelement <16 x i32> %v, i32 14
   %e15 = extractelement <16 x i32> %v, i32 15
   %add0 = add i32 %e0, %e1
-  %add1 = add i32 %add0, %e2
+  %add1 = xor i32 %add0, %e2
   %add2 = add i32 %add1, %e3
   %add3 = add i32 %add2, %e4
   %add4 = add i32 %add3, %e5
@@ -821,19 +821,17 @@ define i64 @explode_4xi64(<4 x i64> %v) {
 ; RV32-NEXT:    vmv.x.s a6, v10
 ; RV32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32-NEXT:    vsrl.vx v10, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    vmv.x.s a7, v8
+; RV32-NEXT:    vmv.x.s a7, v10
+; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    add a1, a1, a3
 ; RV32-NEXT:    add a4, a2, a4
 ; RV32-NEXT:    sltu a2, a4, a2
 ; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    add a6, a4, a6
-; RV32-NEXT:    sltu a2, a6, a4
-; RV32-NEXT:    add a1, a1, a5
+; RV32-NEXT:    xor a1, a1, a5
+; RV32-NEXT:    xor a2, a4, a6
 ; RV32-NEXT:    add a0, a2, a0
-; RV32-NEXT:    add a1, a1, a0
-; RV32-NEXT:    add a0, a6, a7
-; RV32-NEXT:    sltu a2, a0, a6
+; RV32-NEXT:    sltu a2, a0, a2
+; RV32-NEXT:    add a1, a1, a7
 ; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    ret
 ;
@@ -849,15 +847,15 @@ define i64 @explode_4xi64(<4 x i64> %v) {
 ; RV64-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-NEXT:    vmv.x.s a3, v8
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    add a0, a0, a3
 ; RV64-NEXT:    ret
   %e0 = extractelement <4 x i64> %v, i32 0
   %e1 = extractelement <4 x i64> %v, i32 1
   %e2 = extractelement <4 x i64> %v, i32 2
   %e3 = extractelement <4 x i64> %v, i32 3
   %add0 = add i64 %e0, %e1
-  %add1 = add i64 %add0, %e2
+  %add1 = xor i64 %add0, %e2
   %add2 = add i64 %add1, %e3
   ret i64 %add2
 }
@@ -907,13 +905,11 @@ define i64 @explode_8xi64(<8 x i64> %v) {
 ; RV32-NEXT:    add a4, a2, a4
 ; RV32-NEXT:    sltu a2, a4, a2
 ; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    add a6, a4, a6
-; RV32-NEXT:    sltu a2, a6, a4
-; RV32-NEXT:    add a1, a1, a5
-; RV32-NEXT:    add a2, a2, a7
-; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    add t0, a6, t0
-; RV32-NEXT:    sltu a2, t0, a6
+; RV32-NEXT:    xor a1, a1, a5
+; RV32-NEXT:    xor a2, a4, a6
+; RV32-NEXT:    add t0, a2, t0
+; RV32-NEXT:    sltu a2, t0, a2
+; RV32-NEXT:    add a1, a1, a7
 ; RV32-NEXT:    add a2, a2, t1
 ; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    add t2, t0, t2
@@ -963,12 +959,12 @@ define i64 @explode_8xi64(<8 x i64> %v) {
 ; RV64-NEXT:    ld a6, 48(sp)
 ; RV64-NEXT:    ld a7, 56(sp)
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    add a0, a0, a5
-; RV64-NEXT:    add a0, a0, a7
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a0, a0, a6
 ; RV64-NEXT:    addi sp, s0, -128
 ; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
@@ -983,7 +979,7 @@ define i64 @explode_8xi64(<8 x i64> %v) {
   %e6 = extractelement <8 x i64> %v, i32 6
   %e7 = extractelement <8 x i64> %v, i32 7
   %add0 = add i64 %e0, %e1
-  %add1 = add i64 %add0, %e2
+  %add1 = xor i64 %add0, %e2
   %add2 = add i64 %add1, %e3
   %add3 = add i64 %add2, %e4
   %add4 = add i64 %add3, %e5
@@ -1031,113 +1027,111 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vslidedown.vi v16, v8, 1
 ; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s11, v24
-; RV32-NEXT:    vmv.x.s ra, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 2
-; RV32-NEXT:    vsrl.vx v24, v16, a1
 ; RV32-NEXT:    vmv.x.s a3, v24
 ; RV32-NEXT:    vmv.x.s a4, v16
+; RV32-NEXT:    vslidedown.vi v16, v8, 2
+; RV32-NEXT:    vsrl.vx v24, v16, a1
+; RV32-NEXT:    vmv.x.s a5, v24
+; RV32-NEXT:    vmv.x.s a6, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 3
 ; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s0, v24
-; RV32-NEXT:    vmv.x.s a5, v16
+; RV32-NEXT:    vmv.x.s a7, v24
+; RV32-NEXT:    vmv.x.s t0, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 4
 ; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s1, v24
-; RV32-NEXT:    vmv.x.s a6, v16
+; RV32-NEXT:    vmv.x.s s3, v24
+; RV32-NEXT:    vmv.x.s t1, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 5
 ; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s2, v24
-; RV32-NEXT:    vmv.x.s a7, v16
+; RV32-NEXT:    vmv.x.s s4, v24
+; RV32-NEXT:    vmv.x.s t2, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 6
 ; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s3, v24
-; RV32-NEXT:    vmv.x.s t0, v16
+; RV32-NEXT:    vmv.x.s s5, v24
+; RV32-NEXT:    vmv.x.s t3, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 7
 ; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s4, v24
-; RV32-NEXT:    vmv.x.s t1, v16
+; RV32-NEXT:    vmv.x.s s6, v24
+; RV32-NEXT:    vmv.x.s t4, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 8
 ; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s5, v24
-; RV32-NEXT:    vmv.x.s t2, v16
+; RV32-NEXT:    vmv.x.s s7, v24
+; RV32-NEXT:    vmv.x.s t5, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 9
 ; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s6, v24
-; RV32-NEXT:    vmv.x.s t3, v16
+; RV32-NEXT:    vmv.x.s s8, v24
+; RV32-NEXT:    vmv.x.s t6, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 10
 ; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s7, v24
-; RV32-NEXT:    vmv.x.s t4, v16
+; RV32-NEXT:    vmv.x.s s9, v24
+; RV32-NEXT:    vmv.x.s s0, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 11
 ; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s8, v24
-; RV32-NEXT:    vmv.x.s t5, v16
+; RV32-NEXT:    vmv.x.s s10, v24
+; RV32-NEXT:    vmv.x.s s1, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 12
 ; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s9, v24
-; RV32-NEXT:    vmv.x.s t6, v16
+; RV32-NEXT:    vmv.x.s s11, v24
+; RV32-NEXT:    vmv.x.s s2, v16
 ; RV32-NEXT:    vslidedown.vi v24, v8, 13
 ; RV32-NEXT:    vsrl.vx v16, v24, a1
-; RV32-NEXT:    vmv.x.s s10, v16
+; RV32-NEXT:    vmv.x.s ra, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 14
 ; RV32-NEXT:    vsrl.vx v0, v16, a1
 ; RV32-NEXT:    vslidedown.vi v8, v8, 15
 ; RV32-NEXT:    vmv.x.s a2, v24
 ; RV32-NEXT:    vsrl.vx v24, v8, a1
 ; RV32-NEXT:    lw a1, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT:    add s11, a1, s11
-; RV32-NEXT:    add ra, a0, ra
-; RV32-NEXT:    sltu a0, ra, a0
-; RV32-NEXT:    add a0, s11, a0
-; RV32-NEXT:    add a0, a0, a3
-; RV32-NEXT:    add a4, ra, a4
-; RV32-NEXT:    sltu a1, a4, ra
-; RV32-NEXT:    add a1, a1, s0
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a5, a4, a5
-; RV32-NEXT:    sltu a1, a5, a4
-; RV32-NEXT:    add a1, a1, s1
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a6, a5, a6
-; RV32-NEXT:    sltu a1, a6, a5
-; RV32-NEXT:    add a1, a1, s2
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a7, a6, a7
-; RV32-NEXT:    sltu a1, a7, a6
+; RV32-NEXT:    add a3, a1, a3
+; RV32-NEXT:    add a4, a0, a4
+; RV32-NEXT:    sltu a0, a4, a0
+; RV32-NEXT:    add a0, a3, a0
+; RV32-NEXT:    xor a0, a0, a5
+; RV32-NEXT:    xor a1, a4, a6
+; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    add t0, a1, t0
+; RV32-NEXT:    sltu a1, t0, a1
 ; RV32-NEXT:    add a1, a1, s3
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t0, a7, t0
-; RV32-NEXT:    sltu a1, t0, a7
-; RV32-NEXT:    add a1, a1, s4
-; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    add t1, t0, t1
 ; RV32-NEXT:    sltu a1, t1, t0
-; RV32-NEXT:    add a1, a1, s5
+; RV32-NEXT:    add a1, a1, s4
 ; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    add t2, t1, t2
 ; RV32-NEXT:    sltu a1, t2, t1
-; RV32-NEXT:    add a1, a1, s6
+; RV32-NEXT:    add a1, a1, s5
 ; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    add t3, t2, t3
 ; RV32-NEXT:    sltu a1, t3, t2
-; RV32-NEXT:    add a1, a1, s7
+; RV32-NEXT:    add a1, a1, s6
 ; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    add t4, t3, t4
 ; RV32-NEXT:    sltu a1, t4, t3
-; RV32-NEXT:    add a1, a1, s8
+; RV32-NEXT:    add a1, a1, s7
 ; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    add t5, t4, t5
 ; RV32-NEXT:    sltu a1, t5, t4
-; RV32-NEXT:    add a1, a1, s9
+; RV32-NEXT:    add a1, a1, s8
 ; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    add t6, t5, t6
 ; RV32-NEXT:    sltu a1, t6, t5
+; RV32-NEXT:    add a1, a1, s9
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add s0, t6, s0
+; RV32-NEXT:    sltu a1, s0, t6
 ; RV32-NEXT:    add a1, a1, s10
 ; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add s1, s0, s1
+; RV32-NEXT:    sltu a1, s1, s0
+; RV32-NEXT:    add a1, a1, s11
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add s2, s1, s2
+; RV32-NEXT:    sltu a1, s2, s1
+; RV32-NEXT:    add a1, a1, ra
+; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    vmv.x.s a1, v0
-; RV32-NEXT:    add a2, t6, a2
-; RV32-NEXT:    sltu a3, a2, t6
+; RV32-NEXT:    add a2, s2, a2
+; RV32-NEXT:    sltu a3, a2, s2
 ; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    vmv.x.s a3, v16
 ; RV32-NEXT:    add a0, a0, a1
@@ -1204,20 +1198,20 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV64-NEXT:    ld t6, 112(sp)
 ; RV64-NEXT:    ld s2, 120(sp)
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    add a0, a0, a5
-; RV64-NEXT:    add a7, a7, t0
-; RV64-NEXT:    add a7, a7, t1
-; RV64-NEXT:    add a0, a0, a7
-; RV64-NEXT:    add t2, t2, t3
-; RV64-NEXT:    add t2, t2, t4
-; RV64-NEXT:    add t2, t2, t5
-; RV64-NEXT:    add a0, a0, t2
-; RV64-NEXT:    add t6, t6, s2
-; RV64-NEXT:    add a0, a0, t6
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    add a6, a6, t0
+; RV64-NEXT:    add a0, a0, a6
+; RV64-NEXT:    add t1, t1, t2
+; RV64-NEXT:    add t1, t1, t3
+; RV64-NEXT:    add t1, t1, t4
+; RV64-NEXT:    add a0, a0, t1
+; RV64-NEXT:    add t5, t5, t6
+; RV64-NEXT:    add t5, t5, s2
+; RV64-NEXT:    add a0, a0, t5
 ; RV64-NEXT:    addi sp, s0, -256
 ; RV64-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 240(sp) # 8-byte Folded Reload
@@ -1241,7 +1235,7 @@ define i64 @explode_16xi64(<16 x i64> %v) {
   %e14 = extractelement <16 x i64> %v, i32 14
   %e15 = extractelement <16 x i64> %v, i32 15
   %add0 = add i64 %e0, %e1
-  %add1 = add i64 %add0, %e2
+  %add1 = xor i64 %add0, %e2
   %add2 = add i64 %add1, %e3
   %add3 = add i64 %add2, %e4
   %add4 = add i64 %add3, %e5

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
new file mode 100644
index 000000000000000..5ef6b291e309517
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
@@ -0,0 +1,1243 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+
+define i32 @reduce_sum_2xi32(<2 x i32> %v) {
+; RV32-LABEL: reduce_sum_2xi32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vslidedown.vi v8, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v8
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_sum_2xi32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vslidedown.vi v8, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v8
+; RV64-NEXT:    addw a0, a0, a1
+; RV64-NEXT:    ret
+  %e0 = extractelement <2 x i32> %v, i32 0
+  %e1 = extractelement <2 x i32> %v, i32 1
+  %add0 = add i32 %e0, %e1
+  ret i32 %add0
+}
+
+define i32 @reduce_sum_4xi32(<4 x i32> %v) {
+; RV32-LABEL: reduce_sum_4xi32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vslidedown.vi v9, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v9
+; RV32-NEXT:    vslidedown.vi v8, v8, 3
+; RV32-NEXT:    vmv.x.s a3, v8
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_sum_4xi32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vslidedown.vi v9, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 2
+; RV64-NEXT:    vmv.x.s a2, v9
+; RV64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-NEXT:    vmv.x.s a3, v8
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addw a0, a0, a2
+; RV64-NEXT:    ret
+  %e0 = extractelement <4 x i32> %v, i32 0
+  %e1 = extractelement <4 x i32> %v, i32 1
+  %e2 = extractelement <4 x i32> %v, i32 2
+  %e3 = extractelement <4 x i32> %v, i32 3
+  %add0 = add i32 %e0, %e1
+  %add1 = add i32 %add0, %e2
+  %add2 = add i32 %add1, %e3
+  ret i32 %add2
+}
+
+
+define i32 @reduce_sum_8xi32(<8 x i32> %v) {
+; RV32-LABEL: reduce_sum_8xi32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vslidedown.vi v10, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 3
+; RV32-NEXT:    vmv.x.s a3, v10
+; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v10, v8, 4
+; RV32-NEXT:    vmv.x.s a4, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 5
+; RV32-NEXT:    vmv.x.s a5, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 6
+; RV32-NEXT:    vmv.x.s a6, v10
+; RV32-NEXT:    vslidedown.vi v8, v8, 7
+; RV32-NEXT:    vmv.x.s a7, v8
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_sum_8xi32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vslidedown.vi v10, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-NEXT:    vmv.x.s a2, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 3
+; RV64-NEXT:    vmv.x.s a3, v10
+; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v10, v8, 4
+; RV64-NEXT:    vmv.x.s a4, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 5
+; RV64-NEXT:    vmv.x.s a5, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 6
+; RV64-NEXT:    vmv.x.s a6, v10
+; RV64-NEXT:    vslidedown.vi v8, v8, 7
+; RV64-NEXT:    vmv.x.s a7, v8
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    addw a0, a0, a7
+; RV64-NEXT:    ret
+  %e0 = extractelement <8 x i32> %v, i32 0
+  %e1 = extractelement <8 x i32> %v, i32 1
+  %e2 = extractelement <8 x i32> %v, i32 2
+  %e3 = extractelement <8 x i32> %v, i32 3
+  %e4 = extractelement <8 x i32> %v, i32 4
+  %e5 = extractelement <8 x i32> %v, i32 5
+  %e6 = extractelement <8 x i32> %v, i32 6
+  %e7 = extractelement <8 x i32> %v, i32 7
+  %add0 = add i32 %e0, %e1
+  %add1 = add i32 %add0, %e2
+  %add2 = add i32 %add1, %e3
+  %add3 = add i32 %add2, %e4
+  %add4 = add i32 %add3, %e5
+  %add5 = add i32 %add4, %e6
+  %add6 = add i32 %add5, %e7
+  ret i32 %add6
+}
+
+define i32 @reduce_sum_16xi32(<16 x i32> %v) {
+; RV32-LABEL: reduce_sum_16xi32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -128
+; RV32-NEXT:    .cfi_def_cfa_offset 128
+; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 116(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    .cfi_offset s2, -12
+; RV32-NEXT:    addi s0, sp, 128
+; RV32-NEXT:    .cfi_def_cfa s0, 0
+; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vslidedown.vi v12, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 3
+; RV32-NEXT:    vmv.x.s a3, v12
+; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v12, v8, 4
+; RV32-NEXT:    vmv.x.s a4, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 5
+; RV32-NEXT:    vmv.x.s a5, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 6
+; RV32-NEXT:    vmv.x.s a6, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 7
+; RV32-NEXT:    vmv.x.s a7, v12
+; RV32-NEXT:    mv t0, sp
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vse32.v v8, (t0)
+; RV32-NEXT:    lw t0, 32(sp)
+; RV32-NEXT:    lw t1, 36(sp)
+; RV32-NEXT:    lw t2, 40(sp)
+; RV32-NEXT:    lw t3, 44(sp)
+; RV32-NEXT:    lw t4, 48(sp)
+; RV32-NEXT:    lw t5, 52(sp)
+; RV32-NEXT:    lw t6, 56(sp)
+; RV32-NEXT:    lw s2, 60(sp)
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    add a7, a7, t0
+; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    add t1, t1, t2
+; RV32-NEXT:    add t1, t1, t3
+; RV32-NEXT:    add a0, a0, t1
+; RV32-NEXT:    add t4, t4, t5
+; RV32-NEXT:    add t4, t4, t6
+; RV32-NEXT:    add t4, t4, s2
+; RV32-NEXT:    add a0, a0, t4
+; RV32-NEXT:    addi sp, s0, -128
+; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 116(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_sum_16xi32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -128
+; RV64-NEXT:    .cfi_def_cfa_offset 128
+; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 104(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    .cfi_offset s2, -24
+; RV64-NEXT:    addi s0, sp, 128
+; RV64-NEXT:    .cfi_def_cfa s0, 0
+; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vslidedown.vi v12, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 2
+; RV64-NEXT:    vmv.x.s a2, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 3
+; RV64-NEXT:    vmv.x.s a3, v12
+; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v12, v8, 4
+; RV64-NEXT:    vmv.x.s a4, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 5
+; RV64-NEXT:    vmv.x.s a5, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 6
+; RV64-NEXT:    vmv.x.s a6, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 7
+; RV64-NEXT:    vmv.x.s a7, v12
+; RV64-NEXT:    mv t0, sp
+; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT:    vse32.v v8, (t0)
+; RV64-NEXT:    lw t0, 32(sp)
+; RV64-NEXT:    lw t1, 36(sp)
+; RV64-NEXT:    lw t2, 40(sp)
+; RV64-NEXT:    lw t3, 44(sp)
+; RV64-NEXT:    lw t4, 48(sp)
+; RV64-NEXT:    lw t5, 52(sp)
+; RV64-NEXT:    lw t6, 56(sp)
+; RV64-NEXT:    lw s2, 60(sp)
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    add a7, a7, t0
+; RV64-NEXT:    add a0, a0, a7
+; RV64-NEXT:    add t1, t1, t2
+; RV64-NEXT:    add t1, t1, t3
+; RV64-NEXT:    add a0, a0, t1
+; RV64-NEXT:    add t4, t4, t5
+; RV64-NEXT:    add t4, t4, t6
+; RV64-NEXT:    add t4, t4, s2
+; RV64-NEXT:    addw a0, a0, t4
+; RV64-NEXT:    addi sp, s0, -128
+; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 104(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    ret
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %e3 = extractelement <16 x i32> %v, i32 3
+  %e4 = extractelement <16 x i32> %v, i32 4
+  %e5 = extractelement <16 x i32> %v, i32 5
+  %e6 = extractelement <16 x i32> %v, i32 6
+  %e7 = extractelement <16 x i32> %v, i32 7
+  %e8 = extractelement <16 x i32> %v, i32 8
+  %e9 = extractelement <16 x i32> %v, i32 9
+  %e10 = extractelement <16 x i32> %v, i32 10
+  %e11 = extractelement <16 x i32> %v, i32 11
+  %e12 = extractelement <16 x i32> %v, i32 12
+  %e13 = extractelement <16 x i32> %v, i32 13
+  %e14 = extractelement <16 x i32> %v, i32 14
+  %e15 = extractelement <16 x i32> %v, i32 15
+  %add0 = add i32 %e0, %e1
+  %add1 = add i32 %add0, %e2
+  %add2 = add i32 %add1, %e3
+  %add3 = add i32 %add2, %e4
+  %add4 = add i32 %add3, %e5
+  %add5 = add i32 %add4, %e6
+  %add6 = add i32 %add5, %e7
+  %add7 = add i32 %add6, %e8
+  %add8 = add i32 %add7, %e9
+  %add9 = add i32 %add8, %e10
+  %add10 = add i32 %add9, %e11
+  %add11 = add i32 %add10, %e12
+  %add12 = add i32 %add11, %e13
+  %add13 = add i32 %add12, %e14
+  %add14 = add i32 %add13, %e15
+  ret i32 %add14
+}
+
+define i32 @reduce_sum_16xi32_prefix2(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix2:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v8
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix2:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v8, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v8
+; RV64-NEXT:    addw a0, a0, a1
+; RV64-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %add0 = add i32 %e0, %e1
+  ret i32 %add0
+}
+
+define i32 @reduce_sum_16xi32_prefix3(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix3:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v9, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v9
+; RV32-NEXT:    vslidedown.vi v8, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v8
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix3:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v9, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v9
+; RV64-NEXT:    vslidedown.vi v8, v8, 2
+; RV64-NEXT:    vmv.x.s a2, v8
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addw a0, a0, a2
+; RV64-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %add0 = add i32 %e0, %e1
+  %add1 = add i32 %add0, %e2
+  ret i32 %add1
+}
+
+define i32 @reduce_sum_16xi32_prefix4(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix4:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v9, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v9
+; RV32-NEXT:    vslidedown.vi v8, v8, 3
+; RV32-NEXT:    vmv.x.s a3, v8
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix4:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v9, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 2
+; RV64-NEXT:    vmv.x.s a2, v9
+; RV64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-NEXT:    vmv.x.s a3, v8
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addw a0, a0, a2
+; RV64-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %e3 = extractelement <16 x i32> %v, i32 3
+  %add0 = add i32 %e0, %e1
+  %add1 = add i32 %add0, %e2
+  %add2 = add i32 %add1, %e3
+  ret i32 %add2
+}
+
+define i32 @reduce_sum_16xi32_prefix5(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix5:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v10, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 3
+; RV32-NEXT:    vmv.x.s a3, v10
+; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v8, 4
+; RV32-NEXT:    vmv.x.s a4, v8
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix5:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v10, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-NEXT:    vmv.x.s a2, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 3
+; RV64-NEXT:    vmv.x.s a3, v10
+; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v8, v8, 4
+; RV64-NEXT:    vmv.x.s a4, v8
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    addw a0, a0, a4
+; RV64-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %e3 = extractelement <16 x i32> %v, i32 3
+  %e4 = extractelement <16 x i32> %v, i32 4
+  %add0 = add i32 %e0, %e1
+  %add1 = add i32 %add0, %e2
+  %add2 = add i32 %add1, %e3
+  %add3 = add i32 %add2, %e4
+  ret i32 %add3
+}
+
+define i32 @reduce_sum_16xi32_prefix6(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix6:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v10, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 3
+; RV32-NEXT:    vmv.x.s a3, v10
+; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v10, v8, 4
+; RV32-NEXT:    vmv.x.s a4, v10
+; RV32-NEXT:    vslidedown.vi v8, v8, 5
+; RV32-NEXT:    vmv.x.s a5, v8
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix6:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v10, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-NEXT:    vmv.x.s a2, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 3
+; RV64-NEXT:    vmv.x.s a3, v10
+; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v10, v8, 4
+; RV64-NEXT:    vmv.x.s a4, v10
+; RV64-NEXT:    vslidedown.vi v8, v8, 5
+; RV64-NEXT:    vmv.x.s a5, v8
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    addw a0, a0, a4
+; RV64-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %e3 = extractelement <16 x i32> %v, i32 3
+  %e4 = extractelement <16 x i32> %v, i32 4
+  %e5 = extractelement <16 x i32> %v, i32 5
+  %add0 = add i32 %e0, %e1
+  %add1 = add i32 %add0, %e2
+  %add2 = add i32 %add1, %e3
+  %add3 = add i32 %add2, %e4
+  %add4 = add i32 %add3, %e5
+  ret i32 %add4
+}
+
+define i32 @reduce_sum_16xi32_prefix7(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix7:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v10, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 3
+; RV32-NEXT:    vmv.x.s a3, v10
+; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v10, v8, 4
+; RV32-NEXT:    vmv.x.s a4, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 5
+; RV32-NEXT:    vmv.x.s a5, v10
+; RV32-NEXT:    vslidedown.vi v8, v8, 6
+; RV32-NEXT:    vmv.x.s a6, v8
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix7:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v10, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-NEXT:    vmv.x.s a2, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 3
+; RV64-NEXT:    vmv.x.s a3, v10
+; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v10, v8, 4
+; RV64-NEXT:    vmv.x.s a4, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 5
+; RV64-NEXT:    vmv.x.s a5, v10
+; RV64-NEXT:    vslidedown.vi v8, v8, 6
+; RV64-NEXT:    vmv.x.s a6, v8
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    addw a0, a0, a4
+; RV64-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %e3 = extractelement <16 x i32> %v, i32 3
+  %e4 = extractelement <16 x i32> %v, i32 4
+  %e5 = extractelement <16 x i32> %v, i32 5
+  %e6 = extractelement <16 x i32> %v, i32 6
+  %add0 = add i32 %e0, %e1
+  %add1 = add i32 %add0, %e2
+  %add2 = add i32 %add1, %e3
+  %add3 = add i32 %add2, %e4
+  %add4 = add i32 %add3, %e5
+  %add5 = add i32 %add4, %e6
+  ret i32 %add5
+}
+
+define i32 @reduce_sum_16xi32_prefix8(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v10, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 3
+; RV32-NEXT:    vmv.x.s a3, v10
+; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v10, v8, 4
+; RV32-NEXT:    vmv.x.s a4, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 5
+; RV32-NEXT:    vmv.x.s a5, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 6
+; RV32-NEXT:    vmv.x.s a6, v10
+; RV32-NEXT:    vslidedown.vi v8, v8, 7
+; RV32-NEXT:    vmv.x.s a7, v8
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v10, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-NEXT:    vmv.x.s a2, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 3
+; RV64-NEXT:    vmv.x.s a3, v10
+; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v10, v8, 4
+; RV64-NEXT:    vmv.x.s a4, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 5
+; RV64-NEXT:    vmv.x.s a5, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 6
+; RV64-NEXT:    vmv.x.s a6, v10
+; RV64-NEXT:    vslidedown.vi v8, v8, 7
+; RV64-NEXT:    vmv.x.s a7, v8
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    addw a0, a0, a7
+; RV64-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %e3 = extractelement <16 x i32> %v, i32 3
+  %e4 = extractelement <16 x i32> %v, i32 4
+  %e5 = extractelement <16 x i32> %v, i32 5
+  %e6 = extractelement <16 x i32> %v, i32 6
+  %e7 = extractelement <16 x i32> %v, i32 7
+  %add0 = add i32 %e0, %e1
+  %add1 = add i32 %add0, %e2
+  %add2 = add i32 %add1, %e3
+  %add3 = add i32 %add2, %e4
+  %add4 = add i32 %add3, %e5
+  %add5 = add i32 %add4, %e6
+  %add6 = add i32 %add5, %e7
+  ret i32 %add6
+}
+
+define i32 @reduce_sum_16xi32_prefix9(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix9:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -128
+; RV32-NEXT:    .cfi_def_cfa_offset 128
+; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    addi s0, sp, 128
+; RV32-NEXT:    .cfi_def_cfa s0, 0
+; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v12, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 3
+; RV32-NEXT:    vmv.x.s a3, v12
+; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v12, v8, 4
+; RV32-NEXT:    vmv.x.s a4, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 5
+; RV32-NEXT:    vmv.x.s a5, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 6
+; RV32-NEXT:    vmv.x.s a6, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 7
+; RV32-NEXT:    vmv.x.s a7, v12
+; RV32-NEXT:    mv t0, sp
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vse32.v v8, (t0)
+; RV32-NEXT:    lw t0, 32(sp)
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    add a7, a7, t0
+; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    addi sp, s0, -128
+; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix9:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -128
+; RV64-NEXT:    .cfi_def_cfa_offset 128
+; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    addi s0, sp, 128
+; RV64-NEXT:    .cfi_def_cfa s0, 0
+; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v12, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 2
+; RV64-NEXT:    vmv.x.s a2, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 3
+; RV64-NEXT:    vmv.x.s a3, v12
+; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v12, v8, 4
+; RV64-NEXT:    vmv.x.s a4, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 5
+; RV64-NEXT:    vmv.x.s a5, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 6
+; RV64-NEXT:    vmv.x.s a6, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 7
+; RV64-NEXT:    vmv.x.s a7, v12
+; RV64-NEXT:    mv t0, sp
+; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT:    vse32.v v8, (t0)
+; RV64-NEXT:    lw t0, 32(sp)
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    add a7, a7, t0
+; RV64-NEXT:    addw a0, a0, a7
+; RV64-NEXT:    addi sp, s0, -128
+; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %e3 = extractelement <16 x i32> %v, i32 3
+  %e4 = extractelement <16 x i32> %v, i32 4
+  %e5 = extractelement <16 x i32> %v, i32 5
+  %e6 = extractelement <16 x i32> %v, i32 6
+  %e7 = extractelement <16 x i32> %v, i32 7
+  %e8 = extractelement <16 x i32> %v, i32 8
+  %add0 = add i32 %e0, %e1
+  %add1 = add i32 %add0, %e2
+  %add2 = add i32 %add1, %e3
+  %add3 = add i32 %add2, %e4
+  %add4 = add i32 %add3, %e5
+  %add5 = add i32 %add4, %e6
+  %add6 = add i32 %add5, %e7
+  %add7 = add i32 %add6, %e8
+  ret i32 %add7
+}
+
+define i32 @reduce_sum_16xi32_prefix13(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix13:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -128
+; RV32-NEXT:    .cfi_def_cfa_offset 128
+; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    addi s0, sp, 128
+; RV32-NEXT:    .cfi_def_cfa s0, 0
+; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v12, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 3
+; RV32-NEXT:    vmv.x.s a3, v12
+; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v12, v8, 4
+; RV32-NEXT:    vmv.x.s a4, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 5
+; RV32-NEXT:    vmv.x.s a5, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 6
+; RV32-NEXT:    vmv.x.s a6, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 7
+; RV32-NEXT:    vmv.x.s a7, v12
+; RV32-NEXT:    mv t0, sp
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vse32.v v8, (t0)
+; RV32-NEXT:    lw t0, 32(sp)
+; RV32-NEXT:    lw t1, 36(sp)
+; RV32-NEXT:    lw t2, 40(sp)
+; RV32-NEXT:    lw t3, 44(sp)
+; RV32-NEXT:    lw t4, 48(sp)
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    add a7, a7, t0
+; RV32-NEXT:    add a7, a7, t1
+; RV32-NEXT:    add a7, a7, t2
+; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    add t3, t3, t4
+; RV32-NEXT:    add a0, a0, t3
+; RV32-NEXT:    addi sp, s0, -128
+; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix13:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -128
+; RV64-NEXT:    .cfi_def_cfa_offset 128
+; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    addi s0, sp, 128
+; RV64-NEXT:    .cfi_def_cfa s0, 0
+; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v12, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 2
+; RV64-NEXT:    vmv.x.s a2, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 3
+; RV64-NEXT:    vmv.x.s a3, v12
+; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v12, v8, 4
+; RV64-NEXT:    vmv.x.s a4, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 5
+; RV64-NEXT:    vmv.x.s a5, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 6
+; RV64-NEXT:    vmv.x.s a6, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 7
+; RV64-NEXT:    vmv.x.s a7, v12
+; RV64-NEXT:    mv t0, sp
+; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT:    vse32.v v8, (t0)
+; RV64-NEXT:    lw t0, 32(sp)
+; RV64-NEXT:    lw t1, 36(sp)
+; RV64-NEXT:    lw t2, 40(sp)
+; RV64-NEXT:    lw t3, 44(sp)
+; RV64-NEXT:    lw t4, 48(sp)
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    add a7, a7, t0
+; RV64-NEXT:    add a7, a7, t1
+; RV64-NEXT:    add a7, a7, t2
+; RV64-NEXT:    add a0, a0, a7
+; RV64-NEXT:    add t3, t3, t4
+; RV64-NEXT:    addw a0, a0, t3
+; RV64-NEXT:    addi sp, s0, -128
+; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %e3 = extractelement <16 x i32> %v, i32 3
+  %e4 = extractelement <16 x i32> %v, i32 4
+  %e5 = extractelement <16 x i32> %v, i32 5
+  %e6 = extractelement <16 x i32> %v, i32 6
+  %e7 = extractelement <16 x i32> %v, i32 7
+  %e8 = extractelement <16 x i32> %v, i32 8
+  %e9 = extractelement <16 x i32> %v, i32 9
+  %e10 = extractelement <16 x i32> %v, i32 10
+  %e11 = extractelement <16 x i32> %v, i32 11
+  %e12 = extractelement <16 x i32> %v, i32 12
+  %add0 = add i32 %e0, %e1
+  %add1 = add i32 %add0, %e2
+  %add2 = add i32 %add1, %e3
+  %add3 = add i32 %add2, %e4
+  %add4 = add i32 %add3, %e5
+  %add5 = add i32 %add4, %e6
+  %add6 = add i32 %add5, %e7
+  %add7 = add i32 %add6, %e8
+  %add8 = add i32 %add7, %e9
+  %add9 = add i32 %add8, %e10
+  %add10 = add i32 %add9, %e11
+  %add11 = add i32 %add10, %e12
+  ret i32 %add11
+}
+
+
+define i32 @reduce_sum_16xi32_prefix14(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix14:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -128
+; RV32-NEXT:    .cfi_def_cfa_offset 128
+; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    addi s0, sp, 128
+; RV32-NEXT:    .cfi_def_cfa s0, 0
+; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v12, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 3
+; RV32-NEXT:    vmv.x.s a3, v12
+; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v12, v8, 4
+; RV32-NEXT:    vmv.x.s a4, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 5
+; RV32-NEXT:    vmv.x.s a5, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 6
+; RV32-NEXT:    vmv.x.s a6, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 7
+; RV32-NEXT:    vmv.x.s a7, v12
+; RV32-NEXT:    mv t0, sp
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vse32.v v8, (t0)
+; RV32-NEXT:    lw t0, 32(sp)
+; RV32-NEXT:    lw t1, 36(sp)
+; RV32-NEXT:    lw t2, 40(sp)
+; RV32-NEXT:    lw t3, 44(sp)
+; RV32-NEXT:    lw t4, 48(sp)
+; RV32-NEXT:    lw t5, 52(sp)
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    add a7, a7, t0
+; RV32-NEXT:    add a7, a7, t1
+; RV32-NEXT:    add a7, a7, t2
+; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    add t3, t3, t4
+; RV32-NEXT:    add t3, t3, t5
+; RV32-NEXT:    add a0, a0, t3
+; RV32-NEXT:    addi sp, s0, -128
+; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix14:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -128
+; RV64-NEXT:    .cfi_def_cfa_offset 128
+; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    addi s0, sp, 128
+; RV64-NEXT:    .cfi_def_cfa s0, 0
+; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v12, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 2
+; RV64-NEXT:    vmv.x.s a2, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 3
+; RV64-NEXT:    vmv.x.s a3, v12
+; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v12, v8, 4
+; RV64-NEXT:    vmv.x.s a4, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 5
+; RV64-NEXT:    vmv.x.s a5, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 6
+; RV64-NEXT:    vmv.x.s a6, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 7
+; RV64-NEXT:    vmv.x.s a7, v12
+; RV64-NEXT:    mv t0, sp
+; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT:    vse32.v v8, (t0)
+; RV64-NEXT:    lw t0, 32(sp)
+; RV64-NEXT:    lw t1, 36(sp)
+; RV64-NEXT:    lw t2, 40(sp)
+; RV64-NEXT:    lw t3, 44(sp)
+; RV64-NEXT:    lw t4, 48(sp)
+; RV64-NEXT:    lw t5, 52(sp)
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    add a7, a7, t0
+; RV64-NEXT:    add a7, a7, t1
+; RV64-NEXT:    add a7, a7, t2
+; RV64-NEXT:    add a0, a0, a7
+; RV64-NEXT:    add t3, t3, t4
+; RV64-NEXT:    add t3, t3, t5
+; RV64-NEXT:    addw a0, a0, t3
+; RV64-NEXT:    addi sp, s0, -128
+; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %e3 = extractelement <16 x i32> %v, i32 3
+  %e4 = extractelement <16 x i32> %v, i32 4
+  %e5 = extractelement <16 x i32> %v, i32 5
+  %e6 = extractelement <16 x i32> %v, i32 6
+  %e7 = extractelement <16 x i32> %v, i32 7
+  %e8 = extractelement <16 x i32> %v, i32 8
+  %e9 = extractelement <16 x i32> %v, i32 9
+  %e10 = extractelement <16 x i32> %v, i32 10
+  %e11 = extractelement <16 x i32> %v, i32 11
+  %e12 = extractelement <16 x i32> %v, i32 12
+  %e13 = extractelement <16 x i32> %v, i32 13
+  %add0 = add i32 %e0, %e1
+  %add1 = add i32 %add0, %e2
+  %add2 = add i32 %add1, %e3
+  %add3 = add i32 %add2, %e4
+  %add4 = add i32 %add3, %e5
+  %add5 = add i32 %add4, %e6
+  %add6 = add i32 %add5, %e7
+  %add7 = add i32 %add6, %e8
+  %add8 = add i32 %add7, %e9
+  %add9 = add i32 %add8, %e10
+  %add10 = add i32 %add9, %e11
+  %add11 = add i32 %add10, %e12
+  %add12 = add i32 %add11, %e13
+  ret i32 %add12
+}
+
+define i32 @reduce_sum_16xi32_prefix15(ptr %p) {
+; RV32-LABEL: reduce_sum_16xi32_prefix15:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -128
+; RV32-NEXT:    .cfi_def_cfa_offset 128
+; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    addi s0, sp, 128
+; RV32-NEXT:    .cfi_def_cfa s0, 0
+; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v12, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 3
+; RV32-NEXT:    vmv.x.s a3, v12
+; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v12, v8, 4
+; RV32-NEXT:    vmv.x.s a4, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 5
+; RV32-NEXT:    vmv.x.s a5, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 6
+; RV32-NEXT:    vmv.x.s a6, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 7
+; RV32-NEXT:    vmv.x.s a7, v12
+; RV32-NEXT:    mv t0, sp
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vse32.v v8, (t0)
+; RV32-NEXT:    lw t0, 32(sp)
+; RV32-NEXT:    lw t1, 36(sp)
+; RV32-NEXT:    lw t2, 40(sp)
+; RV32-NEXT:    lw t3, 44(sp)
+; RV32-NEXT:    lw t4, 48(sp)
+; RV32-NEXT:    lw t5, 52(sp)
+; RV32-NEXT:    lw t6, 56(sp)
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    add a7, a7, t0
+; RV32-NEXT:    add a7, a7, t1
+; RV32-NEXT:    add a7, a7, t2
+; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    add t3, t3, t4
+; RV32-NEXT:    add t3, t3, t5
+; RV32-NEXT:    add t3, t3, t6
+; RV32-NEXT:    add a0, a0, t3
+; RV32-NEXT:    addi sp, s0, -128
+; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_sum_16xi32_prefix15:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -128
+; RV64-NEXT:    .cfi_def_cfa_offset 128
+; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    addi s0, sp, 128
+; RV64-NEXT:    .cfi_def_cfa s0, 0
+; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v12, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 2
+; RV64-NEXT:    vmv.x.s a2, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 3
+; RV64-NEXT:    vmv.x.s a3, v12
+; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v12, v8, 4
+; RV64-NEXT:    vmv.x.s a4, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 5
+; RV64-NEXT:    vmv.x.s a5, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 6
+; RV64-NEXT:    vmv.x.s a6, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 7
+; RV64-NEXT:    vmv.x.s a7, v12
+; RV64-NEXT:    mv t0, sp
+; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT:    vse32.v v8, (t0)
+; RV64-NEXT:    lw t0, 32(sp)
+; RV64-NEXT:    lw t1, 36(sp)
+; RV64-NEXT:    lw t2, 40(sp)
+; RV64-NEXT:    lw t3, 44(sp)
+; RV64-NEXT:    lw t4, 48(sp)
+; RV64-NEXT:    lw t5, 52(sp)
+; RV64-NEXT:    lw t6, 56(sp)
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    add a7, a7, t0
+; RV64-NEXT:    add a7, a7, t1
+; RV64-NEXT:    add a7, a7, t2
+; RV64-NEXT:    add a0, a0, a7
+; RV64-NEXT:    add t3, t3, t4
+; RV64-NEXT:    add t3, t3, t5
+; RV64-NEXT:    add t3, t3, t6
+; RV64-NEXT:    addw a0, a0, t3
+; RV64-NEXT:    addi sp, s0, -128
+; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %e3 = extractelement <16 x i32> %v, i32 3
+  %e4 = extractelement <16 x i32> %v, i32 4
+  %e5 = extractelement <16 x i32> %v, i32 5
+  %e6 = extractelement <16 x i32> %v, i32 6
+  %e7 = extractelement <16 x i32> %v, i32 7
+  %e8 = extractelement <16 x i32> %v, i32 8
+  %e9 = extractelement <16 x i32> %v, i32 9
+  %e10 = extractelement <16 x i32> %v, i32 10
+  %e11 = extractelement <16 x i32> %v, i32 11
+  %e12 = extractelement <16 x i32> %v, i32 12
+  %e13 = extractelement <16 x i32> %v, i32 13
+  %e14 = extractelement <16 x i32> %v, i32 14
+  %add0 = add i32 %e0, %e1
+  %add1 = add i32 %add0, %e2
+  %add2 = add i32 %add1, %e3
+  %add3 = add i32 %add2, %e4
+  %add4 = add i32 %add3, %e5
+  %add5 = add i32 %add4, %e6
+  %add6 = add i32 %add5, %e7
+  %add7 = add i32 %add6, %e8
+  %add8 = add i32 %add7, %e9
+  %add9 = add i32 %add8, %e10
+  %add10 = add i32 %add9, %e11
+  %add11 = add i32 %add10, %e12
+  %add12 = add i32 %add11, %e13
+  %add13 = add i32 %add12, %e14
+  ret i32 %add13
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
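For context, the extract/add chains exercised by these tests compute a sum over the leading lanes of the loaded vector; per the commit message, the intent is to give the DAG coverage for recognizing that pattern as a reduction. Below is a minimal illustrative sketch (not taken from this commit; the function name is made up) of the equivalent form such a prefix sum corresponds to, written with the generic llvm.vector.reduce.add intrinsic for a 4-element prefix:

  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

  ; Hypothetical equivalent of a 4-element prefix sum of a <16 x i32> load.
  define i32 @reduce_sum_16xi32_prefix4_equiv(ptr %p) {
    %v = load <16 x i32>, ptr %p, align 256
    ; Keep only the first four lanes, then reduce them with a single intrinsic.
    %prefix = shufflevector <16 x i32> %v, <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %prefix)
    ret i32 %sum
  }

On RISC-V with the vector extension, a reduction expressed this way is the kind of shape the backend can lower with a vredsum.vs-based sequence instead of the per-element slidedown/extract/add chains shown in the checks above.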
More information about the llvm-commits mailing list