[llvm] [AArch64][SLP] Add NFC test cases for floating point reductions (PR #106507)

Sushant Gokhale via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 29 01:09:40 PDT 2024


https://github.com/sushgokh created https://github.com/llvm/llvm-project/pull/106507

A follow-up patch will be added to fix some of these tests.

>From 2b064cde08c907f2c2772a315f2470e14546efe5 Mon Sep 17 00:00:00 2001
From: sgokhale <sgokhale at nvidia.com>
Date: Thu, 29 Aug 2024 13:33:56 +0530
Subject: [PATCH] [AArch64][SLP] Add NFC test cases for floating point
 reductions

A follow-up patch will be added to fix some of these tests.
---
 .../Analysis/CostModel/AArch64/reduce-fadd.ll |  72 +++---
 .../SLPVectorizer/AArch64/reduce-fadd.ll      | 225 ++++++++++++++++++
 2 files changed, 263 insertions(+), 34 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll

diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
index a68c21f7943432..c41a532f9f831e 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
@@ -2,6 +2,8 @@
 ; RUN: opt -passes='print<cost-model>' 2>&1 -disable-output -mtriple=aarch64--linux-gnu < %s | FileCheck %s
 ; RUN: opt -passes='print<cost-model>' 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=FP16
 ; RUN: opt -passes='print<cost-model>' 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+bf16 < %s | FileCheck %s --check-prefix=BF16
+; RUN: opt -passes='print<cost-model>' 2>&1 -disable-output -mtriple=aarch64--linux-gnu \
+; RUN:     -mattr=+neoversev2 < %s | FileCheck %s --check-prefixes=FP16,NEOV2
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
@@ -17,17 +19,6 @@ define void @strict_fp_reductions() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; FP16-LABEL: 'strict_fp_reductions'
-; FP16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
 ; BF16-LABEL: 'strict_fp_reductions'
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
@@ -38,6 +29,17 @@ define void @strict_fp_reductions() {
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; NEOV2-LABEL: 'strict_fp_reductions'
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0.0, <4 x half> undef)
   %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0.0, <8 x half> undef)
@@ -76,29 +78,6 @@ define void @fast_fp_reductions() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; FP16-LABEL: 'fast_fp_reductions'
-; FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
-; FP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
 ; BF16-LABEL: 'fast_fp_reductions'
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
@@ -121,6 +100,29 @@ define void @fast_fp_reductions() {
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; BF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; NEOV2-LABEL: 'fast_fp_reductions'
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+; NEOV2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0.0, <4 x half> undef)
   %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0.0, <4 x half> undef)
@@ -172,3 +174,5 @@ declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
 declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
 declare double @llvm.vector.reduce.fadd.v7f64(double, <7 x double>)
 declare double @llvm.vector.reduce.fadd.v9f64(double, <9 x double>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FP16: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
new file mode 100644
index 00000000000000..10fd1f7e7a2995
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
@@ -0,0 +1,225 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -S -mcpu=neoverse-v2 | FileCheck %s
+
+define half @reduction_half2(<2 x half> %vec2){
+; CHECK-LABEL: define half @reduction_half2(
+; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
+; CHECK-NEXT:    ret half [[ADD1]]
+entry:
+  %elt0 = extractelement <2 x half> %vec2, i64 0
+  %elt1 = extractelement <2 x half> %vec2, i64 1
+  %add1 = fadd fast half %elt1, %elt0
+
+  ret half %add1
+}
+
+define half @reduction_half4(<4 x half> %vec4){
+; CHECK-LABEL: define half @reduction_half4(
+; CHECK-SAME: <4 x half> [[VEC4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[VEC4]])
+; CHECK-NEXT:    ret half [[TMP0]]
+entry:
+  %elt0 = extractelement <4 x half> %vec4, i64 0
+  %elt1 = extractelement <4 x half> %vec4, i64 1
+  %elt2 = extractelement <4 x half> %vec4, i64 2
+  %elt3 = extractelement <4 x half> %vec4, i64 3
+  %add1 = fadd fast half %elt1, %elt0
+  %add2 = fadd fast half %elt2, %add1
+  %add3 = fadd fast half %elt3, %add2
+
+  ret half %add3
+}
+
+define half @reduction_half8(<8 x half> %vec8){
+; CHECK-LABEL: define half @reduction_half8(
+; CHECK-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4
+; CHECK-NEXT:    [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5
+; CHECK-NEXT:    [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6
+; CHECK-NEXT:    [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[TMP2]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT4]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast half [[ELT5]], [[ELT6]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd fast half [[OP_RDX2]], [[ELT7]]
+; CHECK-NEXT:    ret half [[TMP0]]
+entry:
+  %elt0 = extractelement <8 x half> %vec8, i64 0
+  %elt1 = extractelement <8 x half> %vec8, i64 1
+  %elt2 = extractelement <8 x half> %vec8, i64 2
+  %elt3 = extractelement <8 x half> %vec8, i64 3
+  %elt4 = extractelement <8 x half> %vec8, i64 4
+  %elt5 = extractelement <8 x half> %vec8, i64 5
+  %elt6 = extractelement <8 x half> %vec8, i64 6
+  %elt7 = extractelement <8 x half> %vec8, i64 7
+  %add1 = fadd fast half %elt1, %elt0
+  %add2 = fadd fast half %elt2, %add1
+  %add3 = fadd fast half %elt3, %add2
+  %add4 = fadd fast half %elt4, %add3
+  %add5 = fadd fast half %elt5, %add4
+  %add6 = fadd fast half %elt6, %add5
+  %add7 = fadd fast half %elt7, %add6
+
+  ret half %add7
+}
+
+define half @reduction_half16(<16 x half> %vec16) {
+; CHECK-LABEL: define half @reduction_half16(
+; CHECK-SAME: <16 x half> [[VEC16:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ELT4:%.*]] = extractelement <16 x half> [[VEC16]], i64 4
+; CHECK-NEXT:    [[ELT5:%.*]] = extractelement <16 x half> [[VEC16]], i64 5
+; CHECK-NEXT:    [[ELT6:%.*]] = extractelement <16 x half> [[VEC16]], i64 6
+; CHECK-NEXT:    [[ELT7:%.*]] = extractelement <16 x half> [[VEC16]], i64 7
+; CHECK-NEXT:    [[ELT12:%.*]] = extractelement <16 x half> [[VEC16]], i64 12
+; CHECK-NEXT:    [[ELT13:%.*]] = extractelement <16 x half> [[VEC16]], i64 13
+; CHECK-NEXT:    [[ELT14:%.*]] = extractelement <16 x half> [[VEC16]], i64 14
+; CHECK-NEXT:    [[ELT15:%.*]] = extractelement <16 x half> [[VEC16]], i64 15
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[TMP4]])
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[TMP2]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast half [[OP_RDX]], [[ELT4]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = fadd fast half [[ELT5]], [[ELT6]]
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = fadd fast half [[ELT7]], [[ELT12]]
+; CHECK-NEXT:    [[OP_RDX4:%.*]] = fadd fast half [[ELT13]], [[ELT14]]
+; CHECK-NEXT:    [[OP_RDX5:%.*]] = fadd fast half [[OP_RDX1]], [[OP_RDX2]]
+; CHECK-NEXT:    [[OP_RDX6:%.*]] = fadd fast half [[OP_RDX3]], [[OP_RDX4]]
+; CHECK-NEXT:    [[OP_RDX7:%.*]] = fadd fast half [[OP_RDX5]], [[OP_RDX6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd fast half [[OP_RDX7]], [[ELT15]]
+; CHECK-NEXT:    ret half [[TMP0]]
+entry:
+  %elt0 = extractelement <16 x half> %vec16, i64 0
+  %elt1 = extractelement <16 x half> %vec16, i64 1
+  %elt2 = extractelement <16 x half> %vec16, i64 2
+  %elt3 = extractelement <16 x half> %vec16, i64 3
+  %elt4 = extractelement <16 x half> %vec16, i64 4
+  %elt5 = extractelement <16 x half> %vec16, i64 5
+  %elt6 = extractelement <16 x half> %vec16, i64 6
+  %elt7 = extractelement <16 x half> %vec16, i64 7
+  %elt8 = extractelement <16 x half> %vec16, i64 8
+  %elt9 = extractelement <16 x half> %vec16, i64 9
+  %elt10 = extractelement <16 x half> %vec16, i64 10
+  %elt11 = extractelement <16 x half> %vec16, i64 11
+  %elt12 = extractelement <16 x half> %vec16, i64 12
+  %elt13 = extractelement <16 x half> %vec16, i64 13
+  %elt14 = extractelement <16 x half> %vec16, i64 14
+  %elt15 = extractelement <16 x half> %vec16, i64 15
+  %add1 = fadd fast half %elt1, %elt0
+  %add2 = fadd fast half %elt2, %add1
+  %add3 = fadd fast half %elt3, %add2
+  %add4 = fadd fast half %elt4, %add3
+  %add5 = fadd fast half %elt5, %add4
+  %add6 = fadd fast half %elt6, %add5
+  %add7 = fadd fast half %elt7, %add6
+  %add8 = fadd fast half %elt8, %add7
+  %add9 = fadd fast half %elt9, %add8
+  %add10 = fadd fast half %elt10, %add9
+  %add11 = fadd fast half %elt11, %add10
+  %add12 = fadd fast half %elt12, %add11
+  %add13 = fadd fast half %elt13, %add12
+  %add14 = fadd fast half %elt14, %add13
+  %add15 = fadd fast half %elt15, %add14
+
+  ret half %add15
+}
+
+define float @reduction_float2(<2 x float> %vec2){
+; CHECK-LABEL: define float @reduction_float2(
+; CHECK-SAME: <2 x float> [[VEC2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <2 x float> [[VEC2]], i64 0
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[VEC2]], i64 1
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast float [[ELT1]], [[ELT0]]
+; CHECK-NEXT:    ret float [[ADD1]]
+entry:
+  %elt0 = extractelement <2 x float> %vec2, i64 0
+  %elt1 = extractelement <2 x float> %vec2, i64 1
+  %add1 = fadd fast float %elt1, %elt0
+
+  ret float %add1
+}
+
+define float @reduction_float4(<4 x float> %vec4){
+; CHECK-LABEL: define float @reduction_float4(
+; CHECK-SAME: <4 x float> [[VEC4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[VEC4]])
+; CHECK-NEXT:    ret float [[TMP0]]
+entry:
+  %elt0 = extractelement <4 x float> %vec4, i64 0
+  %elt1 = extractelement <4 x float> %vec4, i64 1
+  %elt2 = extractelement <4 x float> %vec4, i64 2
+  %elt3 = extractelement <4 x float> %vec4, i64 3
+  %add1 = fadd fast float %elt1, %elt0
+  %add2 = fadd fast float %elt2, %add1
+  %add3 = fadd fast float %elt3, %add2
+
+  ret float %add3
+}
+
+define float @reduction_float8(<8 x float> %vec8){
+; CHECK-LABEL: define float @reduction_float8(
+; CHECK-SAME: <8 x float> [[VEC8:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[VEC8]])
+; CHECK-NEXT:    ret float [[TMP0]]
+entry:
+  %elt0 = extractelement <8 x float> %vec8, i64 0
+  %elt1 = extractelement <8 x float> %vec8, i64 1
+  %elt2 = extractelement <8 x float> %vec8, i64 2
+  %elt3 = extractelement <8 x float> %vec8, i64 3
+  %elt4 = extractelement <8 x float> %vec8, i64 4
+  %elt5 = extractelement <8 x float> %vec8, i64 5
+  %elt6 = extractelement <8 x float> %vec8, i64 6
+  %elt7 = extractelement <8 x float> %vec8, i64 7
+  %add1 = fadd fast float %elt1, %elt0
+  %add2 = fadd fast float %elt2, %add1
+  %add3 = fadd fast float %elt3, %add2
+  %add4 = fadd fast float %elt4, %add3
+  %add5 = fadd fast float %elt5, %add4
+  %add6 = fadd fast float %elt6, %add5
+  %add7 = fadd fast float %elt7, %add6
+
+  ret float %add7
+}
+
+define double @reduction_double2(<2 x double> %vec2){
+; CHECK-LABEL: define double @reduction_double2(
+; CHECK-SAME: <2 x double> [[VEC2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <2 x double> [[VEC2]], i64 0
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x double> [[VEC2]], i64 1
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast double [[ELT1]], [[ELT0]]
+; CHECK-NEXT:    ret double [[ADD1]]
+entry:
+  %elt0 = extractelement <2 x double> %vec2, i64 0
+  %elt1 = extractelement <2 x double> %vec2, i64 1
+  %add1 = fadd fast double %elt1, %elt0
+
+  ret double %add1
+}
+
+define double @reduction_double4(<4 x double> %vec4){
+; CHECK-LABEL: define double @reduction_double4(
+; CHECK-SAME: <4 x double> [[VEC4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[VEC4]])
+; CHECK-NEXT:    ret double [[TMP0]]
+entry:
+  %elt0 = extractelement <4 x double> %vec4, i64 0
+  %elt1 = extractelement <4 x double> %vec4, i64 1
+  %elt2 = extractelement <4 x double> %vec4, i64 2
+  %elt3 = extractelement <4 x double> %vec4, i64 3
+  %add1 = fadd fast double %elt1, %elt0
+  %add2 = fadd fast double %elt2, %add1
+  %add3 = fadd fast double %elt3, %add2
+
+  ret double %add3
+}



More information about the llvm-commits mailing list