[llvm] 9bd58f6 - [ARM][AArch64][RISCV] Add tests for various double reductions. NFC
David Green via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 7 08:39:26 PST 2023
Author: David Green
Date: 2023-02-07T16:39:20Z
New Revision: 9bd58f6f051c9b5546bbaa891fa11d3624b8d9a4
URL: https://github.com/llvm/llvm-project/commit/9bd58f6f051c9b5546bbaa891fa11d3624b8d9a4
DIFF: https://github.com/llvm/llvm-project/commit/9bd58f6f051c9b5546bbaa891fa11d3624b8d9a4.diff
LOG: [ARM][AArch64][RISCV] Add tests for various double reductions. NFC
Added:
llvm/test/CodeGen/AArch64/double_reduct.ll
llvm/test/CodeGen/AArch64/sve-doublereduct.ll
llvm/test/CodeGen/RISCV/double_reduct.ll
llvm/test/CodeGen/Thumb2/mve-doublereduct.ll
Modified:
llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
llvm/test/CodeGen/AArch64/vecreduce-add.ll
llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
Removed:
################################################################################
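
For reference, the "double reduction" shape these tests exercise is two @llvm.vector.reduce.* calls whose scalar results are then combined with the matching scalar operation. A minimal sketch of that pattern, mirroring the add_f32 tests added below (shown here with two v4f32 inputs for brevity; not itself part of the patch):

define float @add_f32_sketch(<4 x float> %a, <4 x float> %b) {
  ; Reduce each input vector to a scalar.
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  ; Combine the two partial reductions with the same scalar operation.
  %r = fadd fast float %r1, %r2
  ret float %r
}
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
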
diff --git a/llvm/test/CodeGen/AArch64/double_reduct.ll b/llvm/test/CodeGen/AArch64/double_reduct.ll
new file mode 100644
index 0000000000000..1fd1eb6fc5dd4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/double_reduct.ll
@@ -0,0 +1,307 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=aarch64-eabi < %s | FileCheck %s
+
+define float @add_f32(<8 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: add_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: faddp v2.4s, v2.4s, v2.4s
+; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
+; CHECK-NEXT: faddp s1, v2.2s
+; CHECK-NEXT: faddp s0, v0.2s
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
+ %r = fadd fast float %r1, %r2
+ ret float %r
+}
+
+define float @fmul_f32(<8 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fmul_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fmul v1.2s, v2.2s, v3.2s
+; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: fmul s1, s1, v1.s[1]
+; CHECK-NEXT: fmul v0.2s, v0.2s, v2.2s
+; CHECK-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
+ %r = fmul fast float %r1, %r2
+ ret float %r
+}
+
+define float @fmin_f32(<8 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fmin_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fminnmv s2, v2.4s
+; CHECK-NEXT: fminnmv s0, v0.4s
+; CHECK-NEXT: fminnm s0, s0, s2
+; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
+ %r = call float @llvm.minnum.f32(float %r1, float %r2)
+ ret float %r
+}
+
+define float @fmax_f32(<8 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fmax_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fmaxnmv s2, v2.4s
+; CHECK-NEXT: fmaxnmv s0, v0.4s
+; CHECK-NEXT: fmaxnm s0, s0, s2
+; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
+ %r = call float @llvm.maxnum.f32(float %r1, float %r2)
+ ret float %r
+}
+
+
+define i32 @add_i32(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: add_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %b)
+ %r = add i32 %r1, %r2
+ ret i32 %r
+}
+
+define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: add_ext_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddlv h0, v0.16b
+; CHECK-NEXT: uaddlv h1, v1.16b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov w9, s1
+; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ret
+ %ae = zext <16 x i8> %a to <16 x i16>
+ %be = zext <16 x i8> %b to <16 x i16>
+ %r1 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %ae)
+ %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
+ %r = add i16 %r1, %r2
+ ret i16 %r
+}
+
+define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: add_ext_v32i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddl2 v3.8h, v0.16b, v1.16b
+; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: uaddlv h2, v2.16b
+; CHECK-NEXT: add v0.8h, v0.8h, v3.8h
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ret
+ %ae = zext <32 x i8> %a to <32 x i16>
+ %be = zext <16 x i8> %b to <16 x i16>
+ %r1 = call i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16> %ae)
+ %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
+ %r = add i16 %r1, %r2
+ ret i16 %r
+}
+
+define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: mul_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mul v1.2s, v2.2s, v3.2s
+; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: mov w9, v1.s[1]
+; CHECK-NEXT: fmov w11, s1
+; CHECK-NEXT: mul v0.2s, v0.2s, v2.2s
+; CHECK-NEXT: mul w9, w11, w9
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: mul w8, w10, w8
+; CHECK-NEXT: mul w0, w8, w9
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b)
+ %r = mul i32 %r1, %r2
+ ret i32 %r
+}
+
+define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: and_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: and v2.8b, v2.8b, v3.8b
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov w8, v2.s[1]
+; CHECK-NEXT: mov w9, v0.s[1]
+; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: fmov w11, s2
+; CHECK-NEXT: and w9, w10, w9
+; CHECK-NEXT: and w8, w11, w8
+; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b)
+ %r = and i32 %r1, %r2
+ ret i32 %r
+}
+
+define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: or_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: orr v2.8b, v2.8b, v3.8b
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov w8, v2.s[1]
+; CHECK-NEXT: mov w9, v0.s[1]
+; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: fmov w11, s2
+; CHECK-NEXT: orr w9, w10, w9
+; CHECK-NEXT: orr w8, w11, w8
+; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b)
+ %r = or i32 %r1, %r2
+ ret i32 %r
+}
+
+define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: xor_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: eor v2.8b, v2.8b, v3.8b
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov w8, v2.s[1]
+; CHECK-NEXT: mov w9, v0.s[1]
+; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: fmov w11, s2
+; CHECK-NEXT: eor w9, w10, w9
+; CHECK-NEXT: eor w8, w11, w8
+; CHECK-NEXT: eor w0, w9, w8
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b)
+ %r = xor i32 %r1, %r2
+ ret i32 %r
+}
+
+define i32 @umin_i32(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: umin_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: uminv s2, v2.4s
+; CHECK-NEXT: uminv s0, v0.4s
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: cmp w9, w8
+; CHECK-NEXT: csel w0, w9, w8, lo
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b)
+ %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2)
+ ret i32 %r
+}
+
+define i32 @umax_i32(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: umax_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umaxv s2, v2.4s
+; CHECK-NEXT: umaxv s0, v0.4s
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: cmp w9, w8
+; CHECK-NEXT: csel w0, w9, w8, hi
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b)
+ %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2)
+ ret i32 %r
+}
+
+define i32 @smin_i32(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: smin_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: sminv s2, v2.4s
+; CHECK-NEXT: sminv s0, v0.4s
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: cmp w9, w8
+; CHECK-NEXT: csel w0, w9, w8, lt
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b)
+ %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2)
+ ret i32 %r
+}
+
+define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: smax_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: smaxv s2, v2.4s
+; CHECK-NEXT: smaxv s0, v0.4s
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: cmp w9, w8
+; CHECK-NEXT: csel w0, w9, w8, gt
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b)
+ %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2)
+ ret i32 %r
+}
+
+declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
+declare i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)
+declare i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16>)
+declare i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32>)
+declare float @llvm.minnum.f32(float, float)
+declare float @llvm.maxnum.f32(float, float)
+declare i32 @llvm.umin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.smax.i32(i32, i32)
diff --git a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
new file mode 100644
index 0000000000000..c79c87b295079
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
@@ -0,0 +1,289 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=aarch64-eabi -mattr=+sve2 < %s | FileCheck %s
+
+define float @add_f32(<vscale x 8 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: add_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fadd z0.s, z0.s, z1.s
+; CHECK-NEXT: faddv s2, p0, z2.s
+; CHECK-NEXT: faddv s0, p0, z0.s
+; CHECK-NEXT: fadd s0, s0, s2
+; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fadd.f32.nxv8f32(float -0.0, <vscale x 8 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fadd.f32.nxv4f32(float -0.0, <vscale x 4 x float> %b)
+ %r = fadd fast float %r1, %r2
+ ret float %r
+}
+
+;define float @fmul_f32(<vscale x 8 x float> %a, <vscale x 4 x float> %b) {
+; %r1 = call fast float @llvm.vector.reduce.fmul.f32.nxv8f32(float 1.0, <vscale x 8 x float> %a)
+; %r2 = call fast float @llvm.vector.reduce.fmul.f32.nxv4f32(float 1.0, <vscale x 4 x float> %b)
+; %r = fmul fast float %r1, %r2
+; ret float %r
+;}
+
+define float @fmin_f32(<vscale x 8 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fmin_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: fminnmv s2, p0, z2.s
+; CHECK-NEXT: fminnmv s0, p0, z0.s
+; CHECK-NEXT: fminnm s0, s0, s2
+; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %b)
+ %r = call float @llvm.minnum.f32(float %r1, float %r2)
+ ret float %r
+}
+
+define float @fmax_f32(<vscale x 8 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fmax_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: fmaxnmv s2, p0, z2.s
+; CHECK-NEXT: fmaxnmv s0, p0, z0.s
+; CHECK-NEXT: fmaxnm s0, s0, s2
+; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %b)
+ %r = call float @llvm.maxnum.f32(float %r1, float %r2)
+ ret float %r
+}
+
+
+define i32 @add_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: add_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: add z0.s, z0.s, z1.s
+; CHECK-NEXT: uaddv d2, p0, z2.s
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: fmov x9, d2
+; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.add.i32.nxv8i32(<vscale x 8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.add.i32.nxv4i32(<vscale x 4 x i32> %b)
+ %r = add i32 %r1, %r2
+ ret i32 %r
+}
+
+define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: add_ext_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z2.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z3.h, z1.b
+; CHECK-NEXT: uunpklo z1.h, z1.b
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add z0.h, z0.h, z2.h
+; CHECK-NEXT: add z1.h, z1.h, z3.h
+; CHECK-NEXT: uaddv d0, p0, z0.h
+; CHECK-NEXT: uaddv d1, p0, z1.h
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ret
+ %ae = zext <vscale x 16 x i8> %a to <vscale x 16 x i16>
+ %be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
+ %r1 = call i16 @llvm.vector.reduce.add.i16.nxv16i16(<vscale x 16 x i16> %ae)
+ %r2 = call i16 @llvm.vector.reduce.add.i16.nxv16i16(<vscale x 16 x i16> %be)
+ %r = add i16 %r1, %r2
+ ret i16 %r
+}
+
+define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: add_ext_v32i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpklo z3.h, z1.b
+; CHECK-NEXT: uunpklo z4.h, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z0.h, z0.b
+; CHECK-NEXT: uunpkhi z5.h, z2.b
+; CHECK-NEXT: uunpklo z2.h, z2.b
+; CHECK-NEXT: add z0.h, z0.h, z1.h
+; CHECK-NEXT: add z1.h, z4.h, z3.h
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add z0.h, z1.h, z0.h
+; CHECK-NEXT: add z1.h, z2.h, z5.h
+; CHECK-NEXT: uaddv d0, p0, z0.h
+; CHECK-NEXT: uaddv d1, p0, z1.h
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ret
+ %ae = zext <vscale x 32 x i8> %a to <vscale x 32 x i16>
+ %be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
+ %r1 = call i16 @llvm.vector.reduce.add.i16.nxv32i16(<vscale x 32 x i16> %ae)
+ %r2 = call i16 @llvm.vector.reduce.add.i16.nxv16i16(<vscale x 16 x i16> %be)
+ %r = add i16 %r1, %r2
+ ret i16 %r
+}
+
+;define i32 @mul_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
+; %r1 = call i32 @llvm.vector.reduce.mul.i32.nxv8i32(<vscale x 8 x i32> %a)
+; %r2 = call i32 @llvm.vector.reduce.mul.i32.nxv4i32(<vscale x 4 x i32> %b)
+; %r = mul i32 %r1, %r2
+; ret i32 %r
+;}
+
+define i32 @and_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: and_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: andv s2, p0, z2.s
+; CHECK-NEXT: andv s0, p0, z0.s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: and w0, w8, w9
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.and.i32.nxv8i32(<vscale x 8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.and.i32.nxv4i32(<vscale x 4 x i32> %b)
+ %r = and i32 %r1, %r2
+ ret i32 %r
+}
+
+define i32 @or_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: or_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-NEXT: orv s2, p0, z2.s
+; CHECK-NEXT: orv s0, p0, z0.s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: orr w0, w8, w9
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.or.i32.nxv8i32(<vscale x 8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.or.i32.nxv4i32(<vscale x 4 x i32> %b)
+ %r = or i32 %r1, %r2
+ ret i32 %r
+}
+
+define i32 @xor_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: xor_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: eor z0.d, z0.d, z1.d
+; CHECK-NEXT: eorv s2, p0, z2.s
+; CHECK-NEXT: eorv s0, p0, z0.s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: eor w0, w8, w9
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.xor.i32.nxv8i32(<vscale x 8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.xor.i32.nxv4i32(<vscale x 4 x i32> %b)
+ %r = xor i32 %r1, %r2
+ ret i32 %r
+}
+
+define i32 @umin_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: umin_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: uminv s2, p0, z2.s
+; CHECK-NEXT: uminv s0, p0, z0.s
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: csel w0, w8, w9, lo
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.umin.i32.nxv8i32(<vscale x 8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.umin.i32.nxv4i32(<vscale x 4 x i32> %b)
+ %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2)
+ ret i32 %r
+}
+
+define i32 @umax_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: umax_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: umaxv s2, p0, z2.s
+; CHECK-NEXT: umaxv s0, p0, z0.s
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: csel w0, w8, w9, hi
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.umax.i32.nxv8i32(<vscale x 8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.umax.i32.nxv4i32(<vscale x 4 x i32> %b)
+ %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2)
+ ret i32 %r
+}
+
+define i32 @smin_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: smin_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: sminv s2, p0, z2.s
+; CHECK-NEXT: sminv s0, p0, z0.s
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: csel w0, w8, w9, lt
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.smin.i32.nxv8i32(<vscale x 8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.smin.i32.nxv4i32(<vscale x 4 x i32> %b)
+ %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2)
+ ret i32 %r
+}
+
+define i32 @smax_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: smax_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: smaxv s2, p0, z2.s
+; CHECK-NEXT: smaxv s0, p0, z0.s
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: csel w0, w8, w9, gt
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.smax.i32.nxv8i32(<vscale x 8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.smax.i32.nxv4i32(<vscale x 4 x i32> %b)
+ %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2)
+ ret i32 %r
+}
+
+declare float @llvm.vector.reduce.fadd.f32.nxv8f32(float, <vscale x 8 x float>)
+declare float @llvm.vector.reduce.fadd.f32.nxv4f32(float, <vscale x 4 x float>)
+declare float @llvm.vector.reduce.fmul.f32.nxv8f32(float, <vscale x 8 x float>)
+declare float @llvm.vector.reduce.fmul.f32.nxv4f32(float, <vscale x 4 x float>)
+declare float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float>)
+declare float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float>)
+declare float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float>)
+declare float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float>)
+declare i32 @llvm.vector.reduce.add.i32.nxv8i32(<vscale x 8 x i32>)
+declare i32 @llvm.vector.reduce.add.i32.nxv4i32(<vscale x 4 x i32>)
+declare i16 @llvm.vector.reduce.add.i16.nxv32i16(<vscale x 32 x i16>)
+declare i16 @llvm.vector.reduce.add.i16.nxv16i16(<vscale x 16 x i16>)
+declare i32 @llvm.vector.reduce.mul.i32.nxv8i32(<vscale x 8 x i32>)
+declare i32 @llvm.vector.reduce.mul.i32.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.and.i32.nxv8i32(<vscale x 8 x i32>)
+declare i32 @llvm.vector.reduce.and.i32.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.or.i32.nxv8i32(<vscale x 8 x i32>)
+declare i32 @llvm.vector.reduce.or.i32.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.xor.i32.nxv8i32(<vscale x 8 x i32>)
+declare i32 @llvm.vector.reduce.xor.i32.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.umin.i32.nxv8i32(<vscale x 8 x i32>)
+declare i32 @llvm.vector.reduce.umin.i32.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.umax.i32.nxv8i32(<vscale x 8 x i32>)
+declare i32 @llvm.vector.reduce.umax.i32.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.smin.i32.nxv8i32(<vscale x 8 x i32>)
+declare i32 @llvm.vector.reduce.smin.i32.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.smax.i32.nxv8i32(<vscale x 8 x i32>)
+declare i32 @llvm.vector.reduce.smax.i32.nxv4i32(<vscale x 4 x i32>)
+declare float @llvm.minnum.f32(float, float)
+declare float @llvm.maxnum.f32(float, float)
+declare i32 @llvm.umin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.smax.i32(i32, i32)
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
index 6c41f6203e064..0106dc2e7f7f5 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
@@ -354,6 +354,21 @@ define double @fminv_nxv2f64(<vscale x 2 x double> %a) {
ret double %res
}
+define float @fadd_reduct_reassoc_v4v8f32(<vscale x 4 x float> %a, <vscale x 8 x float> %b) {
+; CHECK-LABEL: fadd_reduct_reassoc_v4v8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fadd z1.s, z1.s, z2.s
+; CHECK-NEXT: faddv s0, p0, z0.s
+; CHECK-NEXT: faddv s1, p0, z1.s
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.0, <vscale x 4 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.0, <vscale x 8 x float> %b)
+ %r = fadd fast float %r1, %r2
+ ret float %r
+}
+
declare half @llvm.vector.reduce.fadd.nxv2f16(half, <vscale x 2 x half>)
declare half @llvm.vector.reduce.fadd.nxv4f16(half, <vscale x 4 x half>)
declare half @llvm.vector.reduce.fadd.nxv8f16(half, <vscale x 8 x half>)
@@ -362,6 +377,7 @@ declare half @llvm.vector.reduce.fadd.nxv10f16(half, <vscale x 10 x half>)
declare half @llvm.vector.reduce.fadd.nxv12f16(half, <vscale x 12 x half>)
declare float @llvm.vector.reduce.fadd.nxv2f32(float, <vscale x 2 x float>)
declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)
+declare float @llvm.vector.reduce.fadd.nxv8f32(float, <vscale x 8 x float>)
declare double @llvm.vector.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
declare half @llvm.vector.reduce.fmax.nxv2f16(<vscale x 2 x half>)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 82f45e56f8331..8316c881dd484 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -1992,6 +1992,91 @@ entry:
ret i64 %z
}
+define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8 x i8> %bx, <8 x i8> %by) {
+; CHECK-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
+; CHECK-BASE: // %bb.0: // %entry
+; CHECK-BASE-NEXT: sshll v2.8h, v2.8b, #0
+; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT: saddlp v2.4s, v2.8h
+; CHECK-BASE-NEXT: uaddlp v0.4s, v0.8h
+; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-BASE-NEXT: sshll v3.8h, v3.8b, #0
+; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
+; CHECK-BASE-NEXT: sadalp v2.4s, v3.8h
+; CHECK-BASE-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-BASE-NEXT: addv s0, v0.4s
+; CHECK-BASE-NEXT: fmov w0, s0
+; CHECK-BASE-NEXT: ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
+; CHECK-DOT: // %bb.0: // %entry
+; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000
+; CHECK-DOT-NEXT: movi v5.8b, #1
+; CHECK-DOT-NEXT: movi v6.2d, #0000000000000000
+; CHECK-DOT-NEXT: sdot v4.2s, v3.8b, v5.8b
+; CHECK-DOT-NEXT: udot v6.2s, v1.8b, v5.8b
+; CHECK-DOT-NEXT: sdot v4.2s, v2.8b, v5.8b
+; CHECK-DOT-NEXT: udot v6.2s, v0.8b, v5.8b
+; CHECK-DOT-NEXT: add v0.2s, v6.2s, v4.2s
+; CHECK-DOT-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-DOT-NEXT: fmov w0, s0
+; CHECK-DOT-NEXT: ret
+entry:
+ %axx = zext <8 x i8> %ax to <8 x i32>
+ %az1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %axx)
+ %ayy = zext <8 x i8> %ay to <8 x i32>
+ %az2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %ayy)
+ %az = add i32 %az1, %az2
+ %bxx = sext <8 x i8> %bx to <8 x i32>
+ %bz1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %bxx)
+ %byy = sext <8 x i8> %by to <8 x i32>
+ %bz2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %byy)
+ %bz = add i32 %bz1, %bz2
+ %z = add i32 %az, %bz
+ ret i32 %z
+}
+
+define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i16> %ay, <8 x i16> %bx, <8 x i16> %by) {
+; CHECK-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ushll v5.4s, v0.4h, #0
+; CHECK-NEXT: ushll v4.4s, v2.4h, #0
+; CHECK-NEXT: ushll v6.4s, v1.4h, #0
+; CHECK-NEXT: uaddw2 v0.4s, v5.4s, v0.8h
+; CHECK-NEXT: ushll v5.4s, v3.4h, #0
+; CHECK-NEXT: uaddw2 v1.4s, v6.4s, v1.8h
+; CHECK-NEXT: uaddw2 v2.4s, v4.4s, v2.8h
+; CHECK-NEXT: uaddw2 v3.4s, v5.4s, v3.8h
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %axx = zext <8 x i16> %ax to <8 x i32>
+ %s1h = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %s1l = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %axs = add <4 x i32> %s1h, %s1l
+ %ayy = zext <8 x i16> %ay to <8 x i32>
+ %s2h = shufflevector <8 x i32> %ayy, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %s2l = shufflevector <8 x i32> %ayy, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ays = add <4 x i32> %s2h, %s2l
+ %az = add <4 x i32> %axs, %ays
+ %bxx = zext <8 x i16> %bx to <8 x i32>
+ %s3h = shufflevector <8 x i32> %bxx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %s3l = shufflevector <8 x i32> %bxx, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %bxs = add <4 x i32> %s3h, %s3l
+ %byy = zext <8 x i16> %by to <8 x i32>
+ %s4h = shufflevector <8 x i32> %byy, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %s4l = shufflevector <8 x i32> %byy, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %bys = add <4 x i32> %s4h, %s4l
+ %bz = add <4 x i32> %bxs, %bys
+ %z = add <4 x i32> %az, %bz
+ %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z)
+ ret i32 %z2
+}
+
define i64 @add_pair_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
; CHECK-LABEL: add_pair_v2i64_v2i64:
; CHECK: // %bb.0: // %entry
@@ -2006,6 +2091,200 @@ entry:
ret i64 %z
}
+define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
+; CHECK-BASE-LABEL: full:
+; CHECK-BASE: // %bb.0: // %entry
+; CHECK-BASE-NEXT: // kill: def $w3 killed $w3 def $x3
+; CHECK-BASE-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-BASE-NEXT: sxtw x8, w1
+; CHECK-BASE-NEXT: sxtw x9, w3
+; CHECK-BASE-NEXT: add x10, x0, x8
+; CHECK-BASE-NEXT: add x11, x2, x9
+; CHECK-BASE-NEXT: ldr d2, [x0]
+; CHECK-BASE-NEXT: ldr d3, [x2]
+; CHECK-BASE-NEXT: ldr d0, [x10]
+; CHECK-BASE-NEXT: add x10, x10, x8
+; CHECK-BASE-NEXT: ldr d1, [x11]
+; CHECK-BASE-NEXT: add x11, x11, x9
+; CHECK-BASE-NEXT: uabdl v0.8h, v0.8b, v1.8b
+; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b
+; CHECK-BASE-NEXT: ldr d2, [x10]
+; CHECK-BASE-NEXT: ldr d3, [x11]
+; CHECK-BASE-NEXT: add x10, x10, x8
+; CHECK-BASE-NEXT: uaddlp v0.4s, v0.8h
+; CHECK-BASE-NEXT: add x11, x11, x9
+; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
+; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b
+; CHECK-BASE-NEXT: ldr d2, [x10]
+; CHECK-BASE-NEXT: ldr d3, [x11]
+; CHECK-BASE-NEXT: add x10, x10, x8
+; CHECK-BASE-NEXT: add x11, x11, x9
+; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
+; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b
+; CHECK-BASE-NEXT: ldr d2, [x10]
+; CHECK-BASE-NEXT: ldr d3, [x11]
+; CHECK-BASE-NEXT: add x10, x10, x8
+; CHECK-BASE-NEXT: add x11, x11, x9
+; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
+; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b
+; CHECK-BASE-NEXT: ldr d2, [x10]
+; CHECK-BASE-NEXT: ldr d3, [x11]
+; CHECK-BASE-NEXT: add x10, x10, x8
+; CHECK-BASE-NEXT: add x11, x11, x9
+; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
+; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b
+; CHECK-BASE-NEXT: ldr d2, [x10]
+; CHECK-BASE-NEXT: ldr d3, [x11]
+; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
+; CHECK-BASE-NEXT: ldr d1, [x10, x8]
+; CHECK-BASE-NEXT: uabdl v2.8h, v2.8b, v3.8b
+; CHECK-BASE-NEXT: ldr d3, [x11, x9]
+; CHECK-BASE-NEXT: uadalp v0.4s, v2.8h
+; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v3.8b
+; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
+; CHECK-BASE-NEXT: addv s0, v0.4s
+; CHECK-BASE-NEXT: fmov w0, s0
+; CHECK-BASE-NEXT: ret
+;
+; CHECK-DOT-LABEL: full:
+; CHECK-DOT: // %bb.0: // %entry
+; CHECK-DOT-NEXT: // kill: def $w3 killed $w3 def $x3
+; CHECK-DOT-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-DOT-NEXT: sxtw x8, w3
+; CHECK-DOT-NEXT: sxtw x9, w1
+; CHECK-DOT-NEXT: ldr d0, [x0]
+; CHECK-DOT-NEXT: add x10, x0, x9
+; CHECK-DOT-NEXT: ldr d1, [x2]
+; CHECK-DOT-NEXT: add x11, x2, x8
+; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
+; CHECK-DOT-NEXT: movi v3.8b, #1
+; CHECK-DOT-NEXT: uabd v0.8b, v0.8b, v1.8b
+; CHECK-DOT-NEXT: ldr d1, [x10]
+; CHECK-DOT-NEXT: ldr d4, [x11]
+; CHECK-DOT-NEXT: add x10, x10, x9
+; CHECK-DOT-NEXT: add x11, x11, x8
+; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
+; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
+; CHECK-DOT-NEXT: ldr d1, [x10]
+; CHECK-DOT-NEXT: ldr d4, [x11]
+; CHECK-DOT-NEXT: add x10, x10, x9
+; CHECK-DOT-NEXT: add x11, x11, x8
+; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
+; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
+; CHECK-DOT-NEXT: ldr d1, [x10]
+; CHECK-DOT-NEXT: ldr d4, [x11]
+; CHECK-DOT-NEXT: add x10, x10, x9
+; CHECK-DOT-NEXT: add x11, x11, x8
+; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
+; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
+; CHECK-DOT-NEXT: ldr d1, [x10]
+; CHECK-DOT-NEXT: ldr d4, [x11]
+; CHECK-DOT-NEXT: add x10, x10, x9
+; CHECK-DOT-NEXT: add x11, x11, x8
+; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
+; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
+; CHECK-DOT-NEXT: ldr d1, [x10]
+; CHECK-DOT-NEXT: ldr d4, [x11]
+; CHECK-DOT-NEXT: add x10, x10, x9
+; CHECK-DOT-NEXT: add x11, x11, x8
+; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
+; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
+; CHECK-DOT-NEXT: ldr d1, [x10]
+; CHECK-DOT-NEXT: ldr d4, [x11]
+; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
+; CHECK-DOT-NEXT: ldr d0, [x10, x9]
+; CHECK-DOT-NEXT: uabd v1.8b, v1.8b, v4.8b
+; CHECK-DOT-NEXT: ldr d4, [x11, x8]
+; CHECK-DOT-NEXT: udot v2.2s, v1.8b, v3.8b
+; CHECK-DOT-NEXT: uabd v0.8b, v0.8b, v4.8b
+; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
+; CHECK-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
+; CHECK-DOT-NEXT: fmov w0, s0
+; CHECK-DOT-NEXT: ret
+entry:
+ %idx.ext8 = sext i32 %s2 to i64
+ %idx.ext = sext i32 %s1 to i64
+ %0 = load <8 x i8>, ptr %p1, align 1
+ %1 = zext <8 x i8> %0 to <8 x i32>
+ %2 = load <8 x i8>, ptr %p2, align 1
+ %3 = zext <8 x i8> %2 to <8 x i32>
+ %4 = sub nsw <8 x i32> %1, %3
+ %5 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %4, i1 true)
+ %6 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
+ %add.ptr = getelementptr inbounds i8, ptr %p1, i64 %idx.ext
+ %add.ptr9 = getelementptr inbounds i8, ptr %p2, i64 %idx.ext8
+ %7 = load <8 x i8>, ptr %add.ptr, align 1
+ %8 = zext <8 x i8> %7 to <8 x i32>
+ %9 = load <8 x i8>, ptr %add.ptr9, align 1
+ %10 = zext <8 x i8> %9 to <8 x i32>
+ %11 = sub nsw <8 x i32> %8, %10
+ %12 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %11, i1 true)
+ %13 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %12)
+ %op.rdx.1 = add i32 %13, %6
+ %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
+ %add.ptr9.1 = getelementptr inbounds i8, ptr %add.ptr9, i64 %idx.ext8
+ %14 = load <8 x i8>, ptr %add.ptr.1, align 1
+ %15 = zext <8 x i8> %14 to <8 x i32>
+ %16 = load <8 x i8>, ptr %add.ptr9.1, align 1
+ %17 = zext <8 x i8> %16 to <8 x i32>
+ %18 = sub nsw <8 x i32> %15, %17
+ %19 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %18, i1 true)
+ %20 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %19)
+ %op.rdx.2 = add i32 %20, %op.rdx.1
+ %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
+ %add.ptr9.2 = getelementptr inbounds i8, ptr %add.ptr9.1, i64 %idx.ext8
+ %21 = load <8 x i8>, ptr %add.ptr.2, align 1
+ %22 = zext <8 x i8> %21 to <8 x i32>
+ %23 = load <8 x i8>, ptr %add.ptr9.2, align 1
+ %24 = zext <8 x i8> %23 to <8 x i32>
+ %25 = sub nsw <8 x i32> %22, %24
+ %26 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %25, i1 true)
+ %27 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %26)
+ %op.rdx.3 = add i32 %27, %op.rdx.2
+ %add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %idx.ext
+ %add.ptr9.3 = getelementptr inbounds i8, ptr %add.ptr9.2, i64 %idx.ext8
+ %28 = load <8 x i8>, ptr %add.ptr.3, align 1
+ %29 = zext <8 x i8> %28 to <8 x i32>
+ %30 = load <8 x i8>, ptr %add.ptr9.3, align 1
+ %31 = zext <8 x i8> %30 to <8 x i32>
+ %32 = sub nsw <8 x i32> %29, %31
+ %33 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %32, i1 true)
+ %34 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %33)
+ %op.rdx.4 = add i32 %34, %op.rdx.3
+ %add.ptr.4 = getelementptr inbounds i8, ptr %add.ptr.3, i64 %idx.ext
+ %add.ptr9.4 = getelementptr inbounds i8, ptr %add.ptr9.3, i64 %idx.ext8
+ %35 = load <8 x i8>, ptr %add.ptr.4, align 1
+ %36 = zext <8 x i8> %35 to <8 x i32>
+ %37 = load <8 x i8>, ptr %add.ptr9.4, align 1
+ %38 = zext <8 x i8> %37 to <8 x i32>
+ %39 = sub nsw <8 x i32> %36, %38
+ %40 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %39, i1 true)
+ %41 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %40)
+ %op.rdx.5 = add i32 %41, %op.rdx.4
+ %add.ptr.5 = getelementptr inbounds i8, ptr %add.ptr.4, i64 %idx.ext
+ %add.ptr9.5 = getelementptr inbounds i8, ptr %add.ptr9.4, i64 %idx.ext8
+ %42 = load <8 x i8>, ptr %add.ptr.5, align 1
+ %43 = zext <8 x i8> %42 to <8 x i32>
+ %44 = load <8 x i8>, ptr %add.ptr9.5, align 1
+ %45 = zext <8 x i8> %44 to <8 x i32>
+ %46 = sub nsw <8 x i32> %43, %45
+ %47 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %46, i1 true)
+ %48 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %47)
+ %op.rdx.6 = add i32 %48, %op.rdx.5
+ %add.ptr.6 = getelementptr inbounds i8, ptr %add.ptr.5, i64 %idx.ext
+ %add.ptr9.6 = getelementptr inbounds i8, ptr %add.ptr9.5, i64 %idx.ext8
+ %49 = load <8 x i8>, ptr %add.ptr.6, align 1
+ %50 = zext <8 x i8> %49 to <8 x i32>
+ %51 = load <8 x i8>, ptr %add.ptr9.6, align 1
+ %52 = zext <8 x i8> %51 to <8 x i32>
+ %53 = sub nsw <8 x i32> %50, %52
+ %54 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %53, i1 true)
+ %55 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %54)
+ %op.rdx.7 = add i32 %55, %op.rdx.6
+ ret i32 %op.rdx.7
+}
+
+declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1 immarg) #1
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
index 83689f2625cf4..452fc36571ef3 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
@@ -434,6 +434,198 @@ exit:
ret half %red.next
}
+
+define half @fadd_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
+; FULLFP16-LABEL: fadd_reduct_reassoc_v8f16:
+; FULLFP16: // %bb.0:
+; FULLFP16-NEXT: faddp v2.8h, v0.8h, v0.8h
+; FULLFP16-NEXT: faddp v3.8h, v1.8h, v1.8h
+; FULLFP16-NEXT: faddp v0.8h, v2.8h, v0.8h
+; FULLFP16-NEXT: faddp v1.8h, v3.8h, v1.8h
+; FULLFP16-NEXT: faddp h0, v0.2h
+; FULLFP16-NEXT: faddp h1, v1.2h
+; FULLFP16-NEXT: fadd h0, h0, h1
+; FULLFP16-NEXT: ret
+;
+; CHECKNOFP16-LABEL: fadd_reduct_reassoc_v8f16:
+; CHECKNOFP16: // %bb.0:
+; CHECKNOFP16-NEXT: mov h2, v0.h[1]
+; CHECKNOFP16-NEXT: mov h3, v1.h[1]
+; CHECKNOFP16-NEXT: fcvt s4, h0
+; CHECKNOFP16-NEXT: fcvt s5, h1
+; CHECKNOFP16-NEXT: fcvt s2, h2
+; CHECKNOFP16-NEXT: fcvt s3, h3
+; CHECKNOFP16-NEXT: fadd s2, s4, s2
+; CHECKNOFP16-NEXT: fadd s3, s5, s3
+; CHECKNOFP16-NEXT: mov h4, v0.h[2]
+; CHECKNOFP16-NEXT: mov h5, v1.h[2]
+; CHECKNOFP16-NEXT: fcvt h2, s2
+; CHECKNOFP16-NEXT: fcvt h3, s3
+; CHECKNOFP16-NEXT: fcvt s4, h4
+; CHECKNOFP16-NEXT: fcvt s5, h5
+; CHECKNOFP16-NEXT: fcvt s2, h2
+; CHECKNOFP16-NEXT: fcvt s3, h3
+; CHECKNOFP16-NEXT: fadd s2, s2, s4
+; CHECKNOFP16-NEXT: fadd s3, s3, s5
+; CHECKNOFP16-NEXT: mov h4, v0.h[3]
+; CHECKNOFP16-NEXT: mov h5, v1.h[3]
+; CHECKNOFP16-NEXT: fcvt h2, s2
+; CHECKNOFP16-NEXT: fcvt h3, s3
+; CHECKNOFP16-NEXT: fcvt s4, h4
+; CHECKNOFP16-NEXT: fcvt s5, h5
+; CHECKNOFP16-NEXT: fcvt s2, h2
+; CHECKNOFP16-NEXT: fcvt s3, h3
+; CHECKNOFP16-NEXT: fadd s2, s2, s4
+; CHECKNOFP16-NEXT: fadd s3, s3, s5
+; CHECKNOFP16-NEXT: mov h4, v0.h[4]
+; CHECKNOFP16-NEXT: mov h5, v1.h[4]
+; CHECKNOFP16-NEXT: fcvt h2, s2
+; CHECKNOFP16-NEXT: fcvt h3, s3
+; CHECKNOFP16-NEXT: fcvt s4, h4
+; CHECKNOFP16-NEXT: fcvt s5, h5
+; CHECKNOFP16-NEXT: fcvt s2, h2
+; CHECKNOFP16-NEXT: fcvt s3, h3
+; CHECKNOFP16-NEXT: fadd s2, s2, s4
+; CHECKNOFP16-NEXT: fadd s3, s3, s5
+; CHECKNOFP16-NEXT: mov h4, v0.h[5]
+; CHECKNOFP16-NEXT: mov h5, v1.h[5]
+; CHECKNOFP16-NEXT: fcvt h2, s2
+; CHECKNOFP16-NEXT: fcvt h3, s3
+; CHECKNOFP16-NEXT: fcvt s4, h4
+; CHECKNOFP16-NEXT: fcvt s5, h5
+; CHECKNOFP16-NEXT: fcvt s2, h2
+; CHECKNOFP16-NEXT: fcvt s3, h3
+; CHECKNOFP16-NEXT: fadd s2, s2, s4
+; CHECKNOFP16-NEXT: fadd s3, s3, s5
+; CHECKNOFP16-NEXT: mov h4, v0.h[6]
+; CHECKNOFP16-NEXT: mov h5, v1.h[6]
+; CHECKNOFP16-NEXT: mov h0, v0.h[7]
+; CHECKNOFP16-NEXT: mov h1, v1.h[7]
+; CHECKNOFP16-NEXT: fcvt h2, s2
+; CHECKNOFP16-NEXT: fcvt h3, s3
+; CHECKNOFP16-NEXT: fcvt s4, h4
+; CHECKNOFP16-NEXT: fcvt s5, h5
+; CHECKNOFP16-NEXT: fcvt s0, h0
+; CHECKNOFP16-NEXT: fcvt s1, h1
+; CHECKNOFP16-NEXT: fcvt s2, h2
+; CHECKNOFP16-NEXT: fcvt s3, h3
+; CHECKNOFP16-NEXT: fadd s2, s2, s4
+; CHECKNOFP16-NEXT: fadd s3, s3, s5
+; CHECKNOFP16-NEXT: fcvt h2, s2
+; CHECKNOFP16-NEXT: fcvt h3, s3
+; CHECKNOFP16-NEXT: fcvt s2, h2
+; CHECKNOFP16-NEXT: fcvt s3, h3
+; CHECKNOFP16-NEXT: fadd s0, s2, s0
+; CHECKNOFP16-NEXT: fadd s1, s3, s1
+; CHECKNOFP16-NEXT: fcvt h0, s0
+; CHECKNOFP16-NEXT: fcvt h1, s1
+; CHECKNOFP16-NEXT: fcvt s1, h1
+; CHECKNOFP16-NEXT: fcvt s0, h0
+; CHECKNOFP16-NEXT: fadd s0, s0, s1
+; CHECKNOFP16-NEXT: fcvt h0, s0
+; CHECKNOFP16-NEXT: ret
+ %r1 = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %a)
+ %r2 = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %b)
+ %r = fadd fast half %r1, %r2
+ ret half %r
+}
+
+define float @fadd_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: fadd_reduct_reassoc_v8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: faddp v1.4s, v2.4s, v2.4s
+; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
+; CHECK-NEXT: faddp s1, v1.2s
+; CHECK-NEXT: faddp s0, v0.2s
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %b)
+ %r = fadd fast float %r1, %r2
+ ret float %r
+}
+
+define float @fadd_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fadd_reduct_reassoc_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
+; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s
+; CHECK-NEXT: faddp s0, v0.2s
+; CHECK-NEXT: faddp s1, v1.2s
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
+ %r = fadd fast float %r1, %r2
+ ret float %r
+}
+
+define float @fadd_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fadd_reduct_reassoc_v4f32_init:
+; CHECK: // %bb.0:
+; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s
+; CHECK-NEXT: faddp v2.4s, v2.4s, v2.4s
+; CHECK-NEXT: faddp s1, v1.2s
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: faddp s1, v2.2s
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %i, <4 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
+ %r = fadd fast float %r1, %r2
+ ret float %r
+}
+
+define float @fadd_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: fadd_reduct_reassoc_v4v8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
+; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s
+; CHECK-NEXT: faddp s0, v0.2s
+; CHECK-NEXT: faddp s1, v1.2s
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %b)
+ %r = fadd fast float %r1, %r2
+ ret float %r
+}
+
+define double @fadd_reduct_reassoc_v4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: fadd_reduct_reassoc_v4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v2.2d, v2.2d, v3.2d
+; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: faddp d1, v2.2d
+; CHECK-NEXT: faddp d0, v0.2d
+; CHECK-NEXT: fadd d0, d0, d1
+; CHECK-NEXT: ret
+ %r1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %a)
+ %r2 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %b)
+ %r = fadd fast double %r1, %r2
+ ret double %r
+}
+
+define float @fadd_reduct_reassoc_v4f32_extrause(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fadd_reduct_reassoc_v4f32_extrause:
+; CHECK: // %bb.0:
+; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
+; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s
+; CHECK-NEXT: faddp s0, v0.2s
+; CHECK-NEXT: faddp s1, v1.2s
+; CHECK-NEXT: fadd s1, s0, s1
+; CHECK-NEXT: fmul s0, s1, s0
+; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
+ %r = fadd fast float %r1, %r2
+ %p = fmul float %r, %r1
+ ret float %p
+}
+
; Function Attrs: nounwind readnone
declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>)
declare half @llvm.vector.reduce.fadd.f16.v8f16(half, <8 x half>)
diff --git a/llvm/test/CodeGen/RISCV/double_reduct.ll b/llvm/test/CodeGen/RISCV/double_reduct.ll
new file mode 100644
index 0000000000000..bd910f1230a71
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/double_reduct.ll
@@ -0,0 +1,369 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+experimental-zvfh,+v,+m -target-abi=ilp32d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+experimental-zvfh,+v,+m -target-abi=lp64d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+
+define float @add_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: add_f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v10, zero
+; CHECK-NEXT: vfredusum.vs v8, v8, v10
+; CHECK-NEXT: vfmv.f.s ft0, v8
+; CHECK-NEXT: vfredusum.vs v8, v9, v10
+; CHECK-NEXT: vfmv.f.s ft1, v8
+; CHECK-NEXT: fadd.s fa0, ft0, ft1
+; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
+ %r = fadd fast float %r1, %r2
+ ret float %r
+}
+
+define float @fmul_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fmul_f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v10, v8, 2
+; CHECK-NEXT: vfmul.vv v8, v8, v10
+; CHECK-NEXT: vrgather.vi v10, v8, 1
+; CHECK-NEXT: vfmul.vv v8, v8, v10
+; CHECK-NEXT: vfmv.f.s ft0, v8
+; CHECK-NEXT: vslidedown.vi v8, v9, 2
+; CHECK-NEXT: vfmul.vv v8, v9, v8
+; CHECK-NEXT: vrgather.vi v9, v8, 1
+; CHECK-NEXT: vfmul.vv v8, v8, v9
+; CHECK-NEXT: vfmv.f.s ft1, v8
+; CHECK-NEXT: fmul.s fa0, ft0, ft1
+; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
+ %r = fmul fast float %r1, %r2
+ ret float %r
+}
+
+define float @fmin_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fmin_f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI2_0)
+; CHECK-NEXT: flw ft0, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vfmv.s.f v10, ft0
+; CHECK-NEXT: vfredmin.vs v8, v8, v10
+; CHECK-NEXT: vfmv.f.s ft0, v8
+; CHECK-NEXT: vfredmin.vs v8, v9, v10
+; CHECK-NEXT: vfmv.f.s ft1, v8
+; CHECK-NEXT: fmin.s fa0, ft0, ft1
+; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
+ %r = call float @llvm.minnum.f32(float %r1, float %r2)
+ ret float %r
+}
+
+define float @fmax_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fmax_f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI3_0)
+; CHECK-NEXT: flw ft0, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vfmv.s.f v10, ft0
+; CHECK-NEXT: vfredmax.vs v8, v8, v10
+; CHECK-NEXT: vfmv.f.s ft0, v8
+; CHECK-NEXT: vfredmax.vs v8, v9, v10
+; CHECK-NEXT: vfmv.f.s ft1, v8
+; CHECK-NEXT: fmax.s fa0, ft0, ft1
+; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
+ %r = call float @llvm.maxnum.f32(float %r1, float %r2)
+ ret float %r
+}
+
+
+define i32 @add_i32(<4 x i32> %a, <4 x i32> %b) {
+; RV32-LABEL: add_i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vmv.s.x v10, zero
+; RV32-NEXT: vredsum.vs v8, v8, v10
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vredsum.vs v8, v9, v10
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: add_i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vmv.s.x v10, zero
+; RV64-NEXT: vredsum.vs v8, v8, v10
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vredsum.vs v8, v9, v10
+; RV64-NEXT: vmv.x.s a1, v8
+; RV64-NEXT: addw a0, a0, a1
+; RV64-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %b)
+ %r = add i32 %r1, %r2
+ ret i32 %r
+}
+
+define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: add_ext_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 16, e16, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v10, zero
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vwredsumu.vs v8, v8, v10
+; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, ma
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vwredsumu.vs v8, v9, v10
+; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, ma
+; CHECK-NEXT: vmv.x.s a1, v8
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: ret
+ %ae = zext <16 x i8> %a to <16 x i16>
+ %be = zext <16 x i8> %b to <16 x i16>
+ %r1 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %ae)
+ %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
+ %r = add i16 %r1, %r2
+ ret i16 %r
+}
+
+define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: add_ext_v32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 16, e16, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v11, zero
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vwredsumu.vs v10, v10, v11
+; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, ma
+; CHECK-NEXT: vmv.x.s a0, v10
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v10, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vwredsumu.vs v8, v8, v10
+; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, ma
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+ %ae = zext <32 x i8> %a to <32 x i16>
+ %be = zext <16 x i8> %b to <16 x i16>
+ %r1 = call i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16> %ae)
+ %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
+ %r = add i16 %r1, %r2
+ ret i16 %r
+}
+
+define i32 @mul_i32(<4 x i32> %a, <4 x i32> %b) {
+; RV32-LABEL: mul_i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v10, v8, 2
+; RV32-NEXT: vmul.vv v8, v8, v10
+; RV32-NEXT: vrgather.vi v10, v8, 1
+; RV32-NEXT: vmul.vv v8, v8, v10
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vslidedown.vi v8, v9, 2
+; RV32-NEXT: vmul.vv v8, v9, v8
+; RV32-NEXT: vrgather.vi v9, v8, 1
+; RV32-NEXT: vmul.vv v8, v8, v9
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mul_i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v8, 2
+; RV64-NEXT: vmul.vv v8, v8, v10
+; RV64-NEXT: vrgather.vi v10, v8, 1
+; RV64-NEXT: vmul.vv v8, v8, v10
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vslidedown.vi v8, v9, 2
+; RV64-NEXT: vmul.vv v8, v9, v8
+; RV64-NEXT: vrgather.vi v9, v8, 1
+; RV64-NEXT: vmul.vv v8, v8, v9
+; RV64-NEXT: vmv.x.s a1, v8
+; RV64-NEXT: mulw a0, a0, a1
+; RV64-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b)
+ %r = mul i32 %r1, %r2
+ ret i32 %r
+}
+
+define i32 @and_i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: and_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v10, -1
+; CHECK-NEXT: vredand.vs v8, v8, v10
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vredand.vs v8, v9, v10
+; CHECK-NEXT: vmv.x.s a1, v8
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b)
+ %r = and i32 %r1, %r2
+ ret i32 %r
+}
+
+define i32 @or_i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: or_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v10, zero
+; CHECK-NEXT: vredor.vs v8, v8, v10
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vredor.vs v8, v9, v10
+; CHECK-NEXT: vmv.x.s a1, v8
+; CHECK-NEXT: or a0, a0, a1
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b)
+ %r = or i32 %r1, %r2
+ ret i32 %r
+}
+
+define i32 @xor_i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: xor_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v10, zero
+; CHECK-NEXT: vredxor.vs v8, v8, v10
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vredxor.vs v8, v9, v10
+; CHECK-NEXT: vmv.x.s a1, v8
+; CHECK-NEXT: xor a0, a0, a1
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b)
+ %r = xor i32 %r1, %r2
+ ret i32 %r
+}
+
+define i32 @umin_i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: umin_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v10, -1
+; CHECK-NEXT: vredminu.vs v8, v8, v10
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vredminu.vs v8, v9, v10
+; CHECK-NEXT: vmv.x.s a1, v8
+; CHECK-NEXT: bltu a0, a1, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b)
+ %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2)
+ ret i32 %r
+}
+
+define i32 @umax_i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: umax_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v10, zero
+; CHECK-NEXT: vredmaxu.vs v8, v8, v10
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vredmaxu.vs v8, v9, v10
+; CHECK-NEXT: vmv.x.s a1, v8
+; CHECK-NEXT: bltu a1, a0, .LBB12_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB12_2:
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b)
+ %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2)
+ ret i32 %r
+}
+
+define i32 @smin_i32(<4 x i32> %a, <4 x i32> %b) {
+; RV32-LABEL: smin_i32:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a0, 524288
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vmv.s.x v10, a0
+; RV32-NEXT: vredmin.vs v8, v8, v10
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vredmin.vs v8, v9, v10
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: blt a0, a1, .LBB13_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: mv a0, a1
+; RV32-NEXT: .LBB13_2:
+; RV32-NEXT: ret
+;
+; RV64-LABEL: smin_i32:
+; RV64: # %bb.0:
+; RV64-NEXT: lui a0, 524288
+; RV64-NEXT: addiw a0, a0, -1
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vmv.s.x v10, a0
+; RV64-NEXT: vredmin.vs v8, v8, v10
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vredmin.vs v8, v9, v10
+; RV64-NEXT: vmv.x.s a1, v8
+; RV64-NEXT: blt a0, a1, .LBB13_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: mv a0, a1
+; RV64-NEXT: .LBB13_2:
+; RV64-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b)
+ %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2)
+ ret i32 %r
+}
+
+define i32 @smax_i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: smax_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v10, a0
+; CHECK-NEXT: vredmax.vs v8, v8, v10
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vredmax.vs v8, v9, v10
+; CHECK-NEXT: vmv.x.s a1, v8
+; CHECK-NEXT: blt a1, a0, .LBB14_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB14_2:
+; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b)
+ %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2)
+ ret i32 %r
+}
+
+declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
+declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)
+declare i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16>)
+declare i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32>)
+declare float @llvm.minnum.f32(float, float)
+declare float @llvm.maxnum.f32(float, float)
+declare i32 @llvm.umin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.smax.i32(i32, i32)
diff --git a/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll b/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll
new file mode 100644
index 0000000000000..975f7b43067bb
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll
@@ -0,0 +1,329 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -float-abi=hard -verify-machineinstrs %s -o - | FileCheck %s
+
+define float @add_f32(<8 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: add_f32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vadd.f32 q0, q0, q1
+; CHECK-NEXT: vadd.f32 s4, s10, s11
+; CHECK-NEXT: vadd.f32 s2, s2, s3
+; CHECK-NEXT: vadd.f32 s0, s0, s1
+; CHECK-NEXT: vadd.f32 s6, s8, s9
+; CHECK-NEXT: vadd.f32 s0, s0, s2
+; CHECK-NEXT: vadd.f32 s2, s6, s4
+; CHECK-NEXT: vadd.f32 s0, s0, s2
+; CHECK-NEXT: bx lr
+ %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
+ %r = fadd fast float %r1, %r2
+ ret float %r
+}
+
+define float @fmul_f32(<8 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fmul_f32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmul.f32 q0, q0, q1
+; CHECK-NEXT: vmul.f32 s4, s10, s11
+; CHECK-NEXT: vmul.f32 s2, s2, s3
+; CHECK-NEXT: vmul.f32 s0, s0, s1
+; CHECK-NEXT: vmul.f32 s6, s8, s9
+; CHECK-NEXT: vmul.f32 s0, s0, s2
+; CHECK-NEXT: vmul.f32 s2, s6, s4
+; CHECK-NEXT: vmul.f32 s0, s0, s2
+; CHECK-NEXT: bx lr
+ %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
+ %r = fmul fast float %r1, %r2
+ ret float %r
+}
+
+define float @fmin_f32(<8 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fmin_f32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vminnm.f32 q0, q0, q1
+; CHECK-NEXT: vminnm.f32 s4, s8, s9
+; CHECK-NEXT: vminnm.f32 s2, s2, s3
+; CHECK-NEXT: vminnm.f32 s0, s0, s1
+; CHECK-NEXT: vminnm.f32 s0, s0, s2
+; CHECK-NEXT: vminnm.f32 s2, s10, s11
+; CHECK-NEXT: vminnm.f32 s2, s4, s2
+; CHECK-NEXT: vminnm.f32 s0, s0, s2
+; CHECK-NEXT: bx lr
+ %r1 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
+ %r = call float @llvm.minnum.f32(float %r1, float %r2)
+ ret float %r
+}
+
+define float @fmax_f32(<8 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fmax_f32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmaxnm.f32 q0, q0, q1
+; CHECK-NEXT: vmaxnm.f32 s4, s8, s9
+; CHECK-NEXT: vmaxnm.f32 s2, s2, s3
+; CHECK-NEXT: vmaxnm.f32 s0, s0, s1
+; CHECK-NEXT: vmaxnm.f32 s0, s0, s2
+; CHECK-NEXT: vmaxnm.f32 s2, s10, s11
+; CHECK-NEXT: vmaxnm.f32 s2, s4, s2
+; CHECK-NEXT: vmaxnm.f32 s0, s0, s2
+; CHECK-NEXT: bx lr
+ %r1 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a)
+ %r2 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
+ %r = call float @llvm.maxnum.f32(float %r1, float %r2)
+ ret float %r
+}
+
+
+define i32 @add_i32(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: add_i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vaddv.u32 r0, q1
+; CHECK-NEXT: vaddva.u32 r0, q0
+; CHECK-NEXT: vaddva.u32 r0, q2
+; CHECK-NEXT: bx lr
+ %r1 = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %b)
+ %r = add i32 %r1, %r2
+ ret i32 %r
+}
+
+define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: add_ext_i16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vaddv.u8 r0, q1
+; CHECK-NEXT: vaddva.u8 r0, q0
+; CHECK-NEXT: bx lr
+ %ae = zext <16 x i8> %a to <16 x i16>
+ %be = zext <16 x i8> %b to <16 x i16>
+ %r1 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %ae)
+ %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
+ %r = add i16 %r1, %r2
+ ret i16 %r
+}
+
+define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: add_ext_v32i16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
+; CHECK-NEXT: mov r1, sp
+; CHECK-NEXT: add r2, sp, #16
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
+; CHECK-NEXT: vaddv.u16 r0, q1
+; CHECK-NEXT: vaddva.u16 r0, q0
+; CHECK-NEXT: vldrb.u16 q0, [r1, #8]
+; CHECK-NEXT: vaddva.u16 r0, q0
+; CHECK-NEXT: vldrb.u16 q0, [r2, #8]
+; CHECK-NEXT: vaddva.u16 r0, q0
+; CHECK-NEXT: vaddva.u8 r0, q2
+; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: bx lr
+ %ae = zext <32 x i8> %a to <32 x i16>
+ %be = zext <16 x i8> %b to <16 x i16>
+ %r1 = call i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16> %ae)
+ %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
+ %r = add i16 %r1, %r2
+ ret i16 %r
+}
+
+define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: mul_i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: vmul.i32 q0, q0, q1
+; CHECK-NEXT: vmov r0, r1, d5
+; CHECK-NEXT: vmov r6, r3, d0
+; CHECK-NEXT: vmov r12, lr, d1
+; CHECK-NEXT: vmov r4, r5, d4
+; CHECK-NEXT: muls r0, r1, r0
+; CHECK-NEXT: mul r2, r12, lr
+; CHECK-NEXT: muls r3, r6, r3
+; CHECK-NEXT: mul r1, r4, r5
+; CHECK-NEXT: muls r2, r3, r2
+; CHECK-NEXT: muls r0, r1, r0
+; CHECK-NEXT: muls r0, r2, r0
+; CHECK-NEXT: pop {r4, r5, r6, pc}
+ %r1 = call i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b)
+ %r = mul i32 %r1, %r2
+ ret i32 %r
+}
+
+define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: and_i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vmov r6, r1, d5
+; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r12, lr, d1
+; CHECK-NEXT: vmov r4, r5, d4
+; CHECK-NEXT: ands r1, r6
+; CHECK-NEXT: ands r2, r3
+; CHECK-NEXT: and.w r0, r12, lr
+; CHECK-NEXT: ands r0, r2
+; CHECK-NEXT: and.w r2, r4, r5
+; CHECK-NEXT: ands r1, r2
+; CHECK-NEXT: ands r0, r1
+; CHECK-NEXT: pop {r4, r5, r6, pc}
+ %r1 = call i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b)
+ %r = and i32 %r1, %r2
+ ret i32 %r
+}
+
+define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: or_i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: vorr q0, q0, q1
+; CHECK-NEXT: vmov r6, r1, d5
+; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r12, lr, d1
+; CHECK-NEXT: vmov r4, r5, d4
+; CHECK-NEXT: orrs r1, r6
+; CHECK-NEXT: orrs r2, r3
+; CHECK-NEXT: orr.w r0, r12, lr
+; CHECK-NEXT: orrs r0, r2
+; CHECK-NEXT: orr.w r2, r4, r5
+; CHECK-NEXT: orrs r1, r2
+; CHECK-NEXT: orrs r0, r1
+; CHECK-NEXT: pop {r4, r5, r6, pc}
+ %r1 = call i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b)
+ %r = or i32 %r1, %r2
+ ret i32 %r
+}
+
+define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: xor_i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: vmov r6, r1, d5
+; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r12, lr, d1
+; CHECK-NEXT: vmov r4, r5, d4
+; CHECK-NEXT: eors r1, r6
+; CHECK-NEXT: eors r2, r3
+; CHECK-NEXT: eor.w r0, r12, lr
+; CHECK-NEXT: eors r0, r2
+; CHECK-NEXT: eor.w r2, r4, r5
+; CHECK-NEXT: eors r1, r2
+; CHECK-NEXT: eors r0, r1
+; CHECK-NEXT: pop {r4, r5, r6, pc}
+ %r1 = call i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b)
+ %r = xor i32 %r1, %r2
+ ret i32 %r
+}
+
+define i32 @umin_i32(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: umin_i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mov.w r0, #-1
+; CHECK-NEXT: vmin.u32 q0, q0, q1
+; CHECK-NEXT: mov.w r1, #-1
+; CHECK-NEXT: vminv.u32 r0, q2
+; CHECK-NEXT: vminv.u32 r1, q0
+; CHECK-NEXT: cmp r1, r0
+; CHECK-NEXT: csel r0, r1, r0, lo
+; CHECK-NEXT: bx lr
+ %r1 = call i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b)
+ %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2)
+ ret i32 %r
+}
+
+define i32 @umax_i32(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: umax_i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: vmax.u32 q0, q0, q1
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: vmaxv.u32 r0, q2
+; CHECK-NEXT: vmaxv.u32 r1, q0
+; CHECK-NEXT: cmp r1, r0
+; CHECK-NEXT: csel r0, r1, r0, hi
+; CHECK-NEXT: bx lr
+ %r1 = call i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b)
+ %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2)
+ ret i32 %r
+}
+
+define i32 @smin_i32(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: smin_i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mvn r0, #-2147483648
+; CHECK-NEXT: vmin.s32 q0, q0, q1
+; CHECK-NEXT: mvn r1, #-2147483648
+; CHECK-NEXT: vminv.s32 r0, q2
+; CHECK-NEXT: vminv.s32 r1, q0
+; CHECK-NEXT: cmp r1, r0
+; CHECK-NEXT: csel r0, r1, r0, lt
+; CHECK-NEXT: bx lr
+ %r1 = call i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b)
+ %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2)
+ ret i32 %r
+}
+
+define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: smax_i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mov.w r0, #-2147483648
+; CHECK-NEXT: vmax.s32 q0, q0, q1
+; CHECK-NEXT: mov.w r1, #-2147483648
+; CHECK-NEXT: vmaxv.s32 r0, q2
+; CHECK-NEXT: vmaxv.s32 r1, q0
+; CHECK-NEXT: cmp r1, r0
+; CHECK-NEXT: csel r0, r1, r0, gt
+; CHECK-NEXT: bx lr
+ %r1 = call i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32> %a)
+ %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b)
+ %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2)
+ ret i32 %r
+}
+
+declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
+declare i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)
+declare i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16>)
+declare i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32>)
+declare float @llvm.minnum.f32(float, float)
+declare float @llvm.maxnum.f32(float, float)
+declare i32 @llvm.umin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.smax.i32(i32, i32)