[llvm] [AArch64][SLP] Add NFC test cases for floating point reductions (PR #106507)
Sushant Gokhale via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 12 10:36:35 PDT 2024
https://github.com/sushgokh updated https://github.com/llvm/llvm-project/pull/106507
>From df295b4b069dbe27e844821c4d273dd1fd5df2b8 Mon Sep 17 00:00:00 2001
From: sgokhale <sgokhale at nvidia.com>
Date: Thu, 29 Aug 2024 13:33:56 +0530
Subject: [PATCH] [AArch64][SLP] Add NFC test cases for floating point
reductions
A follow-up patch will be added to fix some of the tests.
---
.../Analysis/CostModel/AArch64/reduce-fadd.ll | 42 +
.../SLPVectorizer/AArch64/reduce-fadd.ll | 838 ++++++++++++++++++
2 files changed, 880 insertions(+)
create mode 100644 llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
index a68c21f7943432..58cb8c2c6a8d81 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
@@ -7,8 +7,11 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
define void @strict_fp_reductions() {
; CHECK-LABEL: 'strict_fp_reductions'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v2f16 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %fadd_v16f16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v2f32 = call float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
@@ -18,8 +21,11 @@ define void @strict_fp_reductions() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; FP16-LABEL: 'strict_fp_reductions'
+; FP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v2f16 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %fadd_v16f16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v2f32 = call float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
@@ -29,8 +35,11 @@ define void @strict_fp_reductions() {
; FP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; BF16-LABEL: 'strict_fp_reductions'
+; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v2f16 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %fadd_v16f16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v2f32 = call float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
@@ -39,8 +48,11 @@ define void @strict_fp_reductions() {
; BF16-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+ %fadd_v2f16 = call half @llvm.vector.reduce.fadd.v2f16(half 0.0, <2 x half> undef)
%fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0.0, <4 x half> undef)
%fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0.0, <8 x half> undef)
+ %fadd_v16f16 = call half @llvm.vector.reduce.fadd.v16f16(half 0.0, <16 x half> undef)
+ %fadd_v2f32 = call float @llvm.vector.reduce.fadd.v2f32(float 0.0, <2 x float> undef)
%fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
%fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
%fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
@@ -54,12 +66,18 @@ define void @strict_fp_reductions() {
define void @fast_fp_reductions() {
; CHECK-LABEL: 'fast_fp_reductions'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v2f16_fast = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v2f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %fadd_v16f16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
@@ -77,12 +95,18 @@ define void @fast_fp_reductions() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; FP16-LABEL: 'fast_fp_reductions'
+; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f16_fast = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %fadd_v16f16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
@@ -100,12 +124,18 @@ define void @fast_fp_reductions() {
; FP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; BF16-LABEL: 'fast_fp_reductions'
+; BF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v2f16_fast = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v2f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %fadd_v16f16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
@@ -122,15 +152,24 @@ define void @fast_fp_reductions() {
; BF16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+ %fadd_v2f16_fast = call fast half @llvm.vector.reduce.fadd.v2f16(half 0.0, <2 x half> undef)
+ %fadd_v2f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0.0, <2 x half> undef)
+
%fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0.0, <4 x half> undef)
%fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0.0, <4 x half> undef)
%fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0.0, <8 x half> undef)
%fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0.0, <8 x half> undef)
+ %fadd_v16f16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0.0, <16 x half> undef)
+ %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0.0, <16 x half> undef)
+
%fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0.0, <11 x half> undef)
%fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0.0, <13 x half> undef)
+ %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.0, <2 x float> undef)
+ %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.0, <2 x float> undef)
+
%fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
%fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
@@ -158,11 +197,14 @@ define void @fast_fp_reductions() {
declare bfloat @llvm.vector.reduce.fadd.v4f8(bfloat, <4 x bfloat>)
declare fp128 @llvm.vector.reduce.fadd.v4f128(fp128, <4 x fp128>)
+declare half @llvm.vector.reduce.fadd.v2f16(half, <2 x half>)
declare half @llvm.vector.reduce.fadd.v4f16(half, <4 x half>)
declare half @llvm.vector.reduce.fadd.v8f16(half, <8 x half>)
+declare half @llvm.vector.reduce.fadd.v16f16(half, <16 x half>)
declare half @llvm.vector.reduce.fadd.v11f16(half, <11 x half>)
declare half @llvm.vector.reduce.fadd.v13f16(half, <13 x half>)
+declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.v13f32(float, <13 x float>)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
new file mode 100644
index 00000000000000..c3f0c0192042a8
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
@@ -0,0 +1,838 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux \
+; RUN: -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK,NOFP16
+; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux \
+; RUN: -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FP16
+
+define half @reduce_fast_half2(<2 x half> %vec2) { ; sums both lanes of %vec2 with one 'fast' fadd
+; CHECK-LABEL: define half @reduce_fast_half2(
+; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
+; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
+; CHECK-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
+; CHECK-NEXT: ret half [[ADD1]]
+entry:
+ %elt0 = extractelement <2 x half> %vec2, i64 0
+ %elt1 = extractelement <2 x half> %vec2, i64 1
+ %add1 = fadd fast half %elt1, %elt0
+ ret half %add1 ; expected output keeps the scalar extract/fadd sequence as written
+}
+
+define half @reduce_half2(<2 x half> %vec2) { ; sums both lanes of %vec2 with a plain fadd (no fast-math flags)
+; CHECK-LABEL: define half @reduce_half2(
+; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
+; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
+; CHECK-NEXT: [[ADD1:%.*]] = fadd half [[ELT1]], [[ELT0]]
+; CHECK-NEXT: ret half [[ADD1]]
+entry:
+ %elt0 = extractelement <2 x half> %vec2, i64 0
+ %elt1 = extractelement <2 x half> %vec2, i64 1
+ %add1 = fadd half %elt1, %elt0
+ ret half %add1 ; expected output keeps the scalar extract/fadd sequence as written
+}
+
+define half @reduce_fast_half4(<4 x half> %vec4) { ; sums the 4 lanes of %vec4 through a chain of 'fast' fadds
+; CHECK-LABEL: define half @reduce_fast_half4(
+; CHECK-SAME: <4 x half> [[VEC4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[VEC4]])
+; CHECK-NEXT: ret half [[TMP0]]
+entry:
+ %elt0 = extractelement <4 x half> %vec4, i64 0
+ %elt1 = extractelement <4 x half> %vec4, i64 1
+ %elt2 = extractelement <4 x half> %vec4, i64 2
+ %elt3 = extractelement <4 x half> %vec4, i64 3
+ %add1 = fadd fast half %elt1, %elt0
+ %add2 = fadd fast half %elt2, %add1
+ %add3 = fadd fast half %elt3, %add2
+ ret half %add3 ; expected output: the whole chain becomes one llvm.vector.reduce.fadd.v4f16 call
+}
+
+define half @reduce_half4(<4 x half> %vec4) { ; sums the 4 lanes of %vec4 through a chain of plain fadds (no fast-math flags)
+; CHECK-LABEL: define half @reduce_half4(
+; CHECK-SAME: <4 x half> [[VEC4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[VEC4]], i64 0
+; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1
+; CHECK-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2
+; CHECK-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3
+; CHECK-NEXT: [[ADD1:%.*]] = fadd half [[ELT1]], [[ELT0]]
+; CHECK-NEXT: [[ADD2:%.*]] = fadd half [[ELT2]], [[ADD1]]
+; CHECK-NEXT: [[ADD3:%.*]] = fadd half [[ELT3]], [[ADD2]]
+; CHECK-NEXT: ret half [[ADD3]]
+entry:
+ %elt0 = extractelement <4 x half> %vec4, i64 0
+ %elt1 = extractelement <4 x half> %vec4, i64 1
+ %elt2 = extractelement <4 x half> %vec4, i64 2
+ %elt3 = extractelement <4 x half> %vec4, i64 3
+ %add1 = fadd half %elt1, %elt0
+ %add2 = fadd half %elt2, %add1
+ %add3 = fadd half %elt3, %add2
+ ret half %add3 ; expected output keeps the scalar chain as written
+}
+
+define half @reduce_fast_half8(<8 x half> %vec8) { ; sums the 8 lanes of %vec8 through a chain of 'fast' fadds
+; CHECK-LABEL: define half @reduce_fast_half8(
+; CHECK-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4
+; CHECK-NEXT: [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5
+; CHECK-NEXT: [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6
+; CHECK-NEXT: [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[TMP0]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT4]]
+; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast half [[ELT5]], [[ELT6]]
+; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]]
+; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[OP_RDX2]], [[ELT7]]
+; CHECK-NEXT: ret half [[OP_RDX3]]
+entry:
+ %elt0 = extractelement <8 x half> %vec8, i64 0
+ %elt1 = extractelement <8 x half> %vec8, i64 1
+ %elt2 = extractelement <8 x half> %vec8, i64 2
+ %elt3 = extractelement <8 x half> %vec8, i64 3
+ %elt4 = extractelement <8 x half> %vec8, i64 4
+ %elt5 = extractelement <8 x half> %vec8, i64 5
+ %elt6 = extractelement <8 x half> %vec8, i64 6
+ %elt7 = extractelement <8 x half> %vec8, i64 7
+ %add1 = fadd fast half %elt1, %elt0
+ %add2 = fadd fast half %elt2, %add1
+ %add3 = fadd fast half %elt3, %add2
+ %add4 = fadd fast half %elt4, %add3
+ %add5 = fadd fast half %elt5, %add4
+ %add6 = fadd fast half %elt6, %add5
+ %add7 = fadd fast half %elt7, %add6
+ ret half %add7 ; expected output: lanes 0-3 go through a v4f16 reduction, lanes 4-7 are added as scalars
+}
+
+define half @reduce_half8(<8 x half> %vec8) { ; sums the 8 lanes of %vec8 through a chain of plain fadds (no fast-math flags)
+; CHECK-LABEL: define half @reduce_half8(
+; CHECK-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ELT0:%.*]] = extractelement <8 x half> [[VEC8]], i64 0
+; CHECK-NEXT: [[ELT1:%.*]] = extractelement <8 x half> [[VEC8]], i64 1
+; CHECK-NEXT: [[ELT2:%.*]] = extractelement <8 x half> [[VEC8]], i64 2
+; CHECK-NEXT: [[ELT3:%.*]] = extractelement <8 x half> [[VEC8]], i64 3
+; CHECK-NEXT: [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4
+; CHECK-NEXT: [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5
+; CHECK-NEXT: [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6
+; CHECK-NEXT: [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7
+; CHECK-NEXT: [[ADD1:%.*]] = fadd half [[ELT1]], [[ELT0]]
+; CHECK-NEXT: [[ADD2:%.*]] = fadd half [[ELT2]], [[ADD1]]
+; CHECK-NEXT: [[ADD3:%.*]] = fadd half [[ELT3]], [[ADD2]]
+; CHECK-NEXT: [[ADD4:%.*]] = fadd half [[ELT4]], [[ADD3]]
+; CHECK-NEXT: [[ADD5:%.*]] = fadd half [[ELT5]], [[ADD4]]
+; CHECK-NEXT: [[ADD6:%.*]] = fadd half [[ELT6]], [[ADD5]]
+; CHECK-NEXT: [[ADD7:%.*]] = fadd half [[ELT7]], [[ADD6]]
+; CHECK-NEXT: ret half [[ADD7]]
+entry:
+ %elt0 = extractelement <8 x half> %vec8, i64 0
+ %elt1 = extractelement <8 x half> %vec8, i64 1
+ %elt2 = extractelement <8 x half> %vec8, i64 2
+ %elt3 = extractelement <8 x half> %vec8, i64 3
+ %elt4 = extractelement <8 x half> %vec8, i64 4
+ %elt5 = extractelement <8 x half> %vec8, i64 5
+ %elt6 = extractelement <8 x half> %vec8, i64 6
+ %elt7 = extractelement <8 x half> %vec8, i64 7
+ %add1 = fadd half %elt1, %elt0
+ %add2 = fadd half %elt2, %add1
+ %add3 = fadd half %elt3, %add2
+ %add4 = fadd half %elt4, %add3
+ %add5 = fadd half %elt5, %add4
+ %add6 = fadd half %elt6, %add5
+ %add7 = fadd half %elt7, %add6
+ ret half %add7 ; expected output keeps the scalar chain as written
+}
+
+define half @reduce_fast_half16(<16 x half> %vec16) { ; sums the 16 lanes of %vec16 through a chain of 'fast' fadds
+; NOFP16-LABEL: define half @reduce_fast_half16(
+; NOFP16-SAME: <16 x half> [[VEC16:%.*]]) #[[ATTR0]] {
+; NOFP16-NEXT: [[ENTRY:.*:]]
+; NOFP16-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH8000, <16 x half> [[VEC16]])
+; NOFP16-NEXT: ret half [[TMP0]]
+; FP16-LABEL: define half @reduce_fast_half16(
+; FP16-SAME: <16 x half> [[VEC16:%.*]]) #[[ATTR0]] {
+; FP16-NEXT: [[ENTRY:.*:]]
+; FP16-NEXT: [[ELT4:%.*]] = extractelement <16 x half> [[VEC16]], i64 4
+; FP16-NEXT: [[ELT5:%.*]] = extractelement <16 x half> [[VEC16]], i64 5
+; FP16-NEXT: [[ELT6:%.*]] = extractelement <16 x half> [[VEC16]], i64 6
+; FP16-NEXT: [[ELT7:%.*]] = extractelement <16 x half> [[VEC16]], i64 7
+; FP16-NEXT: [[ELT12:%.*]] = extractelement <16 x half> [[VEC16]], i64 12
+; FP16-NEXT: [[ELT13:%.*]] = extractelement <16 x half> [[VEC16]], i64 13
+; FP16-NEXT: [[ELT14:%.*]] = extractelement <16 x half> [[VEC16]], i64 14
+; FP16-NEXT: [[ELT15:%.*]] = extractelement <16 x half> [[VEC16]], i64 15
+; FP16-NEXT: [[TMP0:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; FP16-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[TMP0]])
+; FP16-NEXT: [[TMP2:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; FP16-NEXT: [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[TMP2]])
+; FP16-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[TMP3]]
+; FP16-NEXT: [[OP_RDX1:%.*]] = fadd fast half [[OP_RDX]], [[ELT4]]
+; FP16-NEXT: [[OP_RDX2:%.*]] = fadd fast half [[ELT5]], [[ELT6]]
+; FP16-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[ELT7]], [[ELT12]]
+; FP16-NEXT: [[OP_RDX4:%.*]] = fadd fast half [[ELT13]], [[ELT14]]
+; FP16-NEXT: [[OP_RDX5:%.*]] = fadd fast half [[OP_RDX1]], [[OP_RDX2]]
+; FP16-NEXT: [[OP_RDX6:%.*]] = fadd fast half [[OP_RDX3]], [[OP_RDX4]]
+; FP16-NEXT: [[OP_RDX7:%.*]] = fadd fast half [[OP_RDX5]], [[OP_RDX6]]
+; FP16-NEXT: [[OP_RDX8:%.*]] = fadd fast half [[OP_RDX7]], [[ELT15]]
+; FP16-NEXT: ret half [[OP_RDX8]]
+entry: ; expected output differs per RUN line: -fullfp16 gives one v16f16 reduction; +fullfp16 gives two v4f16 reductions (lanes 0-3 and 8-11) plus scalar adds
+ %elt0 = extractelement <16 x half> %vec16, i64 0
+ %elt1 = extractelement <16 x half> %vec16, i64 1
+ %elt2 = extractelement <16 x half> %vec16, i64 2
+ %elt3 = extractelement <16 x half> %vec16, i64 3
+ %elt4 = extractelement <16 x half> %vec16, i64 4
+ %elt5 = extractelement <16 x half> %vec16, i64 5
+ %elt6 = extractelement <16 x half> %vec16, i64 6
+ %elt7 = extractelement <16 x half> %vec16, i64 7
+ %elt8 = extractelement <16 x half> %vec16, i64 8
+ %elt9 = extractelement <16 x half> %vec16, i64 9
+ %elt10 = extractelement <16 x half> %vec16, i64 10
+ %elt11 = extractelement <16 x half> %vec16, i64 11
+ %elt12 = extractelement <16 x half> %vec16, i64 12
+ %elt13 = extractelement <16 x half> %vec16, i64 13
+ %elt14 = extractelement <16 x half> %vec16, i64 14
+ %elt15 = extractelement <16 x half> %vec16, i64 15
+ %add1 = fadd fast half %elt1, %elt0
+ %add2 = fadd fast half %elt2, %add1
+ %add3 = fadd fast half %elt3, %add2
+ %add4 = fadd fast half %elt4, %add3
+ %add5 = fadd fast half %elt5, %add4
+ %add6 = fadd fast half %elt6, %add5
+ %add7 = fadd fast half %elt7, %add6
+ %add8 = fadd fast half %elt8, %add7
+ %add9 = fadd fast half %elt9, %add8
+ %add10 = fadd fast half %elt10, %add9
+ %add11 = fadd fast half %elt11, %add10
+ %add12 = fadd fast half %elt12, %add11
+ %add13 = fadd fast half %elt13, %add12
+ %add14 = fadd fast half %elt14, %add13
+ %add15 = fadd fast half %elt15, %add14
+ ret half %add15
+}
+
+define half @reduce_half16(<16 x half> %vec16) { ; sums the 16 lanes of %vec16 through a chain of plain fadds (no fast-math flags)
+; CHECK-LABEL: define half @reduce_half16(
+; CHECK-SAME: <16 x half> [[VEC16:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ELT0:%.*]] = extractelement <16 x half> [[VEC16]], i64 0
+; CHECK-NEXT: [[ELT1:%.*]] = extractelement <16 x half> [[VEC16]], i64 1
+; CHECK-NEXT: [[ELT2:%.*]] = extractelement <16 x half> [[VEC16]], i64 2
+; CHECK-NEXT: [[ELT3:%.*]] = extractelement <16 x half> [[VEC16]], i64 3
+; CHECK-NEXT: [[ELT4:%.*]] = extractelement <16 x half> [[VEC16]], i64 4
+; CHECK-NEXT: [[ELT5:%.*]] = extractelement <16 x half> [[VEC16]], i64 5
+; CHECK-NEXT: [[ELT6:%.*]] = extractelement <16 x half> [[VEC16]], i64 6
+; CHECK-NEXT: [[ELT7:%.*]] = extractelement <16 x half> [[VEC16]], i64 7
+; CHECK-NEXT: [[ELT8:%.*]] = extractelement <16 x half> [[VEC16]], i64 8
+; CHECK-NEXT: [[ELT9:%.*]] = extractelement <16 x half> [[VEC16]], i64 9
+; CHECK-NEXT: [[ELT10:%.*]] = extractelement <16 x half> [[VEC16]], i64 10
+; CHECK-NEXT: [[ELT11:%.*]] = extractelement <16 x half> [[VEC16]], i64 11
+; CHECK-NEXT: [[ELT12:%.*]] = extractelement <16 x half> [[VEC16]], i64 12
+; CHECK-NEXT: [[ELT13:%.*]] = extractelement <16 x half> [[VEC16]], i64 13
+; CHECK-NEXT: [[ELT14:%.*]] = extractelement <16 x half> [[VEC16]], i64 14
+; CHECK-NEXT: [[ELT15:%.*]] = extractelement <16 x half> [[VEC16]], i64 15
+; CHECK-NEXT: [[ADD1:%.*]] = fadd half [[ELT1]], [[ELT0]]
+; CHECK-NEXT: [[ADD2:%.*]] = fadd half [[ELT2]], [[ADD1]]
+; CHECK-NEXT: [[ADD3:%.*]] = fadd half [[ELT3]], [[ADD2]]
+; CHECK-NEXT: [[ADD4:%.*]] = fadd half [[ELT4]], [[ADD3]]
+; CHECK-NEXT: [[ADD5:%.*]] = fadd half [[ELT5]], [[ADD4]]
+; CHECK-NEXT: [[ADD6:%.*]] = fadd half [[ELT6]], [[ADD5]]
+; CHECK-NEXT: [[ADD7:%.*]] = fadd half [[ELT7]], [[ADD6]]
+; CHECK-NEXT: [[ADD8:%.*]] = fadd half [[ELT8]], [[ADD7]]
+; CHECK-NEXT: [[ADD9:%.*]] = fadd half [[ELT9]], [[ADD8]]
+; CHECK-NEXT: [[ADD10:%.*]] = fadd half [[ELT10]], [[ADD9]]
+; CHECK-NEXT: [[ADD11:%.*]] = fadd half [[ELT11]], [[ADD10]]
+; CHECK-NEXT: [[ADD12:%.*]] = fadd half [[ELT12]], [[ADD11]]
+; CHECK-NEXT: [[ADD13:%.*]] = fadd half [[ELT13]], [[ADD12]]
+; CHECK-NEXT: [[ADD14:%.*]] = fadd half [[ELT14]], [[ADD13]]
+; CHECK-NEXT: [[ADD15:%.*]] = fadd half [[ELT15]], [[ADD14]]
+; CHECK-NEXT: ret half [[ADD15]]
+entry:
+ %elt0 = extractelement <16 x half> %vec16, i64 0
+ %elt1 = extractelement <16 x half> %vec16, i64 1
+ %elt2 = extractelement <16 x half> %vec16, i64 2
+ %elt3 = extractelement <16 x half> %vec16, i64 3
+ %elt4 = extractelement <16 x half> %vec16, i64 4
+ %elt5 = extractelement <16 x half> %vec16, i64 5
+ %elt6 = extractelement <16 x half> %vec16, i64 6
+ %elt7 = extractelement <16 x half> %vec16, i64 7
+ %elt8 = extractelement <16 x half> %vec16, i64 8
+ %elt9 = extractelement <16 x half> %vec16, i64 9
+ %elt10 = extractelement <16 x half> %vec16, i64 10
+ %elt11 = extractelement <16 x half> %vec16, i64 11
+ %elt12 = extractelement <16 x half> %vec16, i64 12
+ %elt13 = extractelement <16 x half> %vec16, i64 13
+ %elt14 = extractelement <16 x half> %vec16, i64 14
+ %elt15 = extractelement <16 x half> %vec16, i64 15
+ %add1 = fadd half %elt1, %elt0
+ %add2 = fadd half %elt2, %add1
+ %add3 = fadd half %elt3, %add2
+ %add4 = fadd half %elt4, %add3
+ %add5 = fadd half %elt5, %add4
+ %add6 = fadd half %elt6, %add5
+ %add7 = fadd half %elt7, %add6
+ %add8 = fadd half %elt8, %add7
+ %add9 = fadd half %elt9, %add8
+ %add10 = fadd half %elt10, %add9
+ %add11 = fadd half %elt11, %add10
+ %add12 = fadd half %elt12, %add11
+ %add13 = fadd half %elt13, %add12
+ %add14 = fadd half %elt14, %add13
+ %add15 = fadd half %elt15, %add14
+ ret half %add15 ; expected output keeps the scalar chain as written
+}
+
+define float @reduce_fast_float2(<2 x float> %vec2) { ; sums both lanes of %vec2 with one 'fast' fadd
+; CHECK-LABEL: define float @reduce_fast_float2(
+; CHECK-SAME: <2 x float> [[VEC2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[VEC2]], i64 0
+; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[VEC2]], i64 1
+; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[ELT1]], [[ELT0]]
+; CHECK-NEXT: ret float [[ADD1]]
+entry:
+ %elt0 = extractelement <2 x float> %vec2, i64 0
+ %elt1 = extractelement <2 x float> %vec2, i64 1
+ %add1 = fadd fast float %elt1, %elt0
+ ret float %add1 ; expected output keeps the scalar extract/fadd sequence as written
+}
+
+define float @reduce_float2(<2 x float> %vec2) { ; sums both lanes of %vec2; the fadd carries no fast-math flags
+; CHECK-LABEL: define float @reduce_float2(
+; CHECK-SAME: <2 x float> [[VEC2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[VEC2]], i64 0
+; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[VEC2]], i64 1
+; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[ELT1]], [[ELT0]]
+; CHECK-NEXT: ret float [[ADD1]]
+entry:
+ %elt0 = extractelement <2 x float> %vec2, i64 0
+ %elt1 = extractelement <2 x float> %vec2, i64 1
+ %add1 = fadd float %elt1, %elt0 ; the expected output above keeps this scalar fadd exactly as written
+ ret float %add1
+}
+
+define float @reduce_fast_float4(<4 x float> %vec4) { ; linear chain of fast-math fadds over all 4 lanes of %vec4
+; CHECK-LABEL: define float @reduce_fast_float4(
+; CHECK-SAME: <4 x float> [[VEC4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[VEC4]])
+; CHECK-NEXT: ret float [[TMP0]]
+entry:
+ %elt0 = extractelement <4 x float> %vec4, i64 0
+ %elt1 = extractelement <4 x float> %vec4, i64 1
+ %elt2 = extractelement <4 x float> %vec4, i64 2
+ %elt3 = extractelement <4 x float> %vec4, i64 3
+ %add1 = fadd fast float %elt1, %elt0 ; start of the chain: each later fadd adds the next lane to the running sum
+ %add2 = fadd fast float %elt2, %add1
+ %add3 = fadd fast float %elt3, %add2 ; the expected output above replaces the whole chain with one llvm.vector.reduce.fadd.v4f32 call
+ ret float %add3
+}
+
+define float @reduce_float4(<4 x float> %vec4) { ; same 4-lane chain as reduce_fast_float4 but without fast-math flags
+; CHECK-LABEL: define float @reduce_float4(
+; CHECK-SAME: <4 x float> [[VEC4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[VEC4]], i64 0
+; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[VEC4]], i64 1
+; CHECK-NEXT: [[ELT2:%.*]] = extractelement <4 x float> [[VEC4]], i64 2
+; CHECK-NEXT: [[ELT3:%.*]] = extractelement <4 x float> [[VEC4]], i64 3
+; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[ELT1]], [[ELT0]]
+; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[ELT2]], [[ADD1]]
+; CHECK-NEXT: [[ADD3:%.*]] = fadd float [[ELT3]], [[ADD2]]
+; CHECK-NEXT: ret float [[ADD3]]
+entry:
+ %elt0 = extractelement <4 x float> %vec4, i64 0
+ %elt1 = extractelement <4 x float> %vec4, i64 1
+ %elt2 = extractelement <4 x float> %vec4, i64 2
+ %elt3 = extractelement <4 x float> %vec4, i64 3
+ %add1 = fadd float %elt1, %elt0 ; start of the chain: each later fadd adds the next lane to the running sum
+ %add2 = fadd float %elt2, %add1
+ %add3 = fadd float %elt3, %add2 ; the expected output above leaves the scalar chain untouched
+ ret float %add3
+}
+
+define float @reduce_fast_float8(<8 x float> %vec8) { ; linear chain of fast-math fadds over all 8 lanes of %vec8
+; CHECK-LABEL: define float @reduce_fast_float8(
+; CHECK-SAME: <8 x float> [[VEC8:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[VEC8]])
+; CHECK-NEXT: ret float [[TMP0]]
+entry:
+ %elt0 = extractelement <8 x float> %vec8, i64 0
+ %elt1 = extractelement <8 x float> %vec8, i64 1
+ %elt2 = extractelement <8 x float> %vec8, i64 2
+ %elt3 = extractelement <8 x float> %vec8, i64 3
+ %elt4 = extractelement <8 x float> %vec8, i64 4
+ %elt5 = extractelement <8 x float> %vec8, i64 5
+ %elt6 = extractelement <8 x float> %vec8, i64 6
+ %elt7 = extractelement <8 x float> %vec8, i64 7
+ %add1 = fadd fast float %elt1, %elt0 ; start of the chain: each later fadd adds the next lane to the running sum
+ %add2 = fadd fast float %elt2, %add1
+ %add3 = fadd fast float %elt3, %add2
+ %add4 = fadd fast float %elt4, %add3
+ %add5 = fadd fast float %elt5, %add4
+ %add6 = fadd fast float %elt6, %add5
+ %add7 = fadd fast float %elt7, %add6 ; the expected output above replaces the whole chain with one llvm.vector.reduce.fadd.v8f32 call
+ ret float %add7
+}
+
+define float @reduce_float8(<8 x float> %vec8) { ; same 8-lane chain as reduce_fast_float8 but without fast-math flags
+; CHECK-LABEL: define float @reduce_float8(
+; CHECK-SAME: <8 x float> [[VEC8:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ELT0:%.*]] = extractelement <8 x float> [[VEC8]], i64 0
+; CHECK-NEXT: [[ELT1:%.*]] = extractelement <8 x float> [[VEC8]], i64 1
+; CHECK-NEXT: [[ELT2:%.*]] = extractelement <8 x float> [[VEC8]], i64 2
+; CHECK-NEXT: [[ELT3:%.*]] = extractelement <8 x float> [[VEC8]], i64 3
+; CHECK-NEXT: [[ELT4:%.*]] = extractelement <8 x float> [[VEC8]], i64 4
+; CHECK-NEXT: [[ELT5:%.*]] = extractelement <8 x float> [[VEC8]], i64 5
+; CHECK-NEXT: [[ELT6:%.*]] = extractelement <8 x float> [[VEC8]], i64 6
+; CHECK-NEXT: [[ELT7:%.*]] = extractelement <8 x float> [[VEC8]], i64 7
+; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[ELT1]], [[ELT0]]
+; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[ELT2]], [[ADD1]]
+; CHECK-NEXT: [[ADD3:%.*]] = fadd float [[ELT3]], [[ADD2]]
+; CHECK-NEXT: [[ADD4:%.*]] = fadd float [[ELT4]], [[ADD3]]
+; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[ELT5]], [[ADD4]]
+; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[ELT6]], [[ADD5]]
+; CHECK-NEXT: [[ADD7:%.*]] = fadd float [[ELT7]], [[ADD6]]
+; CHECK-NEXT: ret float [[ADD7]]
+entry:
+ %elt0 = extractelement <8 x float> %vec8, i64 0
+ %elt1 = extractelement <8 x float> %vec8, i64 1
+ %elt2 = extractelement <8 x float> %vec8, i64 2
+ %elt3 = extractelement <8 x float> %vec8, i64 3
+ %elt4 = extractelement <8 x float> %vec8, i64 4
+ %elt5 = extractelement <8 x float> %vec8, i64 5
+ %elt6 = extractelement <8 x float> %vec8, i64 6
+ %elt7 = extractelement <8 x float> %vec8, i64 7
+ %add1 = fadd float %elt1, %elt0 ; start of the chain: each later fadd adds the next lane to the running sum
+ %add2 = fadd float %elt2, %add1
+ %add3 = fadd float %elt3, %add2
+ %add4 = fadd float %elt4, %add3
+ %add5 = fadd float %elt5, %add4
+ %add6 = fadd float %elt6, %add5
+ %add7 = fadd float %elt7, %add6 ; the expected output above leaves the scalar chain untouched
+ ret float %add7
+}
+
+define double @reduce_fast_double2(<2 x double> %vec2) { ; sums both lanes of %vec2 with a single fast-math fadd
+; CHECK-LABEL: define double @reduce_fast_double2(
+; CHECK-SAME: <2 x double> [[VEC2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x double> [[VEC2]], i64 0
+; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x double> [[VEC2]], i64 1
+; CHECK-NEXT: [[ADD1:%.*]] = fadd fast double [[ELT1]], [[ELT0]]
+; CHECK-NEXT: ret double [[ADD1]]
+entry:
+ %elt0 = extractelement <2 x double> %vec2, i64 0
+ %elt1 = extractelement <2 x double> %vec2, i64 1
+ %add1 = fadd fast double %elt1, %elt0 ; the expected output above keeps this scalar fadd exactly as written
+ ret double %add1
+}
+
+define double @reduce_double2(<2 x double> %vec2) { ; sums both lanes of %vec2; the fadd carries no fast-math flags
+; CHECK-LABEL: define double @reduce_double2(
+; CHECK-SAME: <2 x double> [[VEC2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x double> [[VEC2]], i64 0
+; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x double> [[VEC2]], i64 1
+; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[ELT1]], [[ELT0]]
+; CHECK-NEXT: ret double [[ADD1]]
+entry:
+ %elt0 = extractelement <2 x double> %vec2, i64 0
+ %elt1 = extractelement <2 x double> %vec2, i64 1
+ %add1 = fadd double %elt1, %elt0 ; the expected output above keeps this scalar fadd exactly as written
+ ret double %add1
+}
+
+define double @reduce_fast_double4(<4 x double> %vec4) { ; linear chain of fast-math fadds over all 4 lanes of %vec4
+; CHECK-LABEL: define double @reduce_fast_double4(
+; CHECK-SAME: <4 x double> [[VEC4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[VEC4]])
+; CHECK-NEXT: ret double [[TMP0]]
+entry:
+ %elt0 = extractelement <4 x double> %vec4, i64 0
+ %elt1 = extractelement <4 x double> %vec4, i64 1
+ %elt2 = extractelement <4 x double> %vec4, i64 2
+ %elt3 = extractelement <4 x double> %vec4, i64 3
+ %add1 = fadd fast double %elt1, %elt0 ; start of the chain: each later fadd adds the next lane to the running sum
+ %add2 = fadd fast double %elt2, %add1
+ %add3 = fadd fast double %elt3, %add2 ; the expected output above replaces the whole chain with one llvm.vector.reduce.fadd.v4f64 call
+ ret double %add3
+}
+
+define double @reduce_double4(<4 x double> %vec4) { ; same 4-lane chain as reduce_fast_double4 but without fast-math flags
+; CHECK-LABEL: define double @reduce_double4(
+; CHECK-SAME: <4 x double> [[VEC4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x double> [[VEC4]], i64 0
+; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x double> [[VEC4]], i64 1
+; CHECK-NEXT: [[ELT2:%.*]] = extractelement <4 x double> [[VEC4]], i64 2
+; CHECK-NEXT: [[ELT3:%.*]] = extractelement <4 x double> [[VEC4]], i64 3
+; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[ELT1]], [[ELT0]]
+; CHECK-NEXT: [[ADD2:%.*]] = fadd double [[ELT2]], [[ADD1]]
+; CHECK-NEXT: [[ADD3:%.*]] = fadd double [[ELT3]], [[ADD2]]
+; CHECK-NEXT: ret double [[ADD3]]
+entry:
+ %elt0 = extractelement <4 x double> %vec4, i64 0
+ %elt1 = extractelement <4 x double> %vec4, i64 1
+ %elt2 = extractelement <4 x double> %vec4, i64 2
+ %elt3 = extractelement <4 x double> %vec4, i64 3
+ %add1 = fadd double %elt1, %elt0 ; start of the chain: each later fadd adds the next lane to the running sum
+ %add2 = fadd double %elt2, %add1
+ %add3 = fadd double %elt3, %add2 ; the expected output above leaves the scalar chain untouched
+ ret double %add3
+}
+
+; Fixed iteration count: sum += a[i] fully unrolled over five floats loaded from %a at byte offsets 0, 4, 8, 12 and 16 (fast-math adds).
+define float @reduce_fast_float_case1(ptr %a) {
+; CHECK-LABEL: define float @reduce_fast_float_case1(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[A]], align 4
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 4
+; CHECK-NEXT: [[LOAD1:%.*]] = load float, ptr [[GEP]], align 4
+; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[LOAD1]], [[LOAD]]
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 8
+; CHECK-NEXT: [[LOAD2:%.*]] = load float, ptr [[GEP2]], align 4
+; CHECK-NEXT: [[ADD2:%.*]] = fadd fast float [[LOAD2]], [[ADD1]]
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 12
+; CHECK-NEXT: [[LOAD3:%.*]] = load float, ptr [[GEP3]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[LOAD3]], [[ADD2]]
+; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16
+; CHECK-NEXT: [[LOAD4:%.*]] = load float, ptr [[GEP4]], align 4
+; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float [[LOAD4]], [[ADD3]]
+; CHECK-NEXT: ret float [[ADD4]]
+entry:
+ %load = load float, ptr %a ; a[0]
+ %gep = getelementptr inbounds i8, ptr %a, i64 4 ; i8-typed GEP, so the index is a byte offset: 4 bytes = a[1]
+ %load1 = load float, ptr %gep
+ %add1 = fadd fast float %load1, %load
+ %gep2 = getelementptr inbounds i8, ptr %a, i64 8 ; a[2]
+ %load2 = load float, ptr %gep2
+ %add2 = fadd fast float %load2, %add1
+ %gep3 = getelementptr inbounds i8, ptr %a, i64 12 ; a[3]
+ %load3 = load float, ptr %gep3
+ %add3 = fadd fast float %load3, %add2
+ %gep4 = getelementptr inbounds i8, ptr %a, i64 16 ; a[4]
+ %load4 = load float, ptr %gep4
+ %add4 = fadd fast float %load4, %add3 ; the expected output above keeps the scalar load/fadd chain; only explicit "align 4" appears on the loads
+ ret float %add4
+}
+
+; Fixed iteration count: sum += a[i] fully unrolled over five floats loaded from %a at byte offsets 0, 4, 8, 12 and 16 (no fast-math flags).
+define float @reduce_float_case1(ptr %a) {
+; CHECK-LABEL: define float @reduce_float_case1(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[A]], align 4
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 4
+; CHECK-NEXT: [[LOAD1:%.*]] = load float, ptr [[GEP]], align 4
+; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[LOAD1]], [[LOAD]]
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 8
+; CHECK-NEXT: [[LOAD2:%.*]] = load float, ptr [[GEP2]], align 4
+; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[LOAD2]], [[ADD1]]
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 12
+; CHECK-NEXT: [[LOAD3:%.*]] = load float, ptr [[GEP3]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = fadd float [[LOAD3]], [[ADD2]]
+; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16
+; CHECK-NEXT: [[LOAD4:%.*]] = load float, ptr [[GEP4]], align 4
+; CHECK-NEXT: [[ADD4:%.*]] = fadd float [[LOAD4]], [[ADD3]]
+; CHECK-NEXT: ret float [[ADD4]]
+entry:
+ %load = load float, ptr %a ; a[0]
+ %gep = getelementptr inbounds i8, ptr %a, i64 4 ; i8-typed GEP, so the index is a byte offset: 4 bytes = a[1]
+ %load1 = load float, ptr %gep
+ %add1 = fadd float %load1, %load
+ %gep2 = getelementptr inbounds i8, ptr %a, i64 8 ; a[2]
+ %load2 = load float, ptr %gep2
+ %add2 = fadd float %load2, %add1
+ %gep3 = getelementptr inbounds i8, ptr %a, i64 12 ; a[3]
+ %load3 = load float, ptr %gep3
+ %add3 = fadd float %load3, %add2
+ %gep4 = getelementptr inbounds i8, ptr %a, i64 16 ; a[4]
+ %load4 = load float, ptr %gep4
+ %add4 = fadd float %load4, %add3 ; the expected output above keeps the scalar load/fadd chain; only explicit "align 4" appears on the loads
+ ret float %add4
+}
+
+; Reduction needs a shuffle: %add2 pairs a[3] with b[2] and %add3 pairs a[2] with b[3], so lanes 2 and 3 are crossed (fast-math version).
+define float @reduce_fast_float_case2(ptr %a, ptr %b) {
+; CHECK-LABEL: define float @reduce_fast_float_case2(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[GEPA2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
+; CHECK-NEXT: [[GEPA3:%.*]] = getelementptr inbounds float, ptr [[A]], i32 3
+; CHECK-NEXT: [[GEPB2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
+; CHECK-NEXT: [[GEPB3:%.*]] = getelementptr inbounds float, ptr [[B]], i32 3
+; CHECK-NEXT: [[LOADA2:%.*]] = load float, ptr [[GEPA2]], align 4
+; CHECK-NEXT: [[LOADA3:%.*]] = load float, ptr [[GEPA3]], align 4
+; CHECK-NEXT: [[LOADB2:%.*]] = load float, ptr [[GEPB2]], align 4
+; CHECK-NEXT: [[LOADB3:%.*]] = load float, ptr [[GEPB3]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[B]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <2 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[ADD2:%.*]] = fadd fast float [[LOADA3]], [[LOADB2]]
+; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[LOADA2]], [[LOADB3]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; CHECK-NEXT: [[RED1:%.*]] = fadd fast float [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[RED2:%.*]] = fadd fast float [[ADD2]], [[RED1]]
+; CHECK-NEXT: [[RED3:%.*]] = fadd fast float [[ADD3]], [[RED2]]
+; CHECK-NEXT: ret float [[RED3]]
+entry:
+ %gepa1 = getelementptr inbounds float, ptr %a, i32 1
+ %gepa2 = getelementptr inbounds float, ptr %a, i32 2
+ %gepa3 = getelementptr inbounds float, ptr %a, i32 3
+ %gepb1 = getelementptr inbounds float, ptr %b, i32 1
+ %gepb2 = getelementptr inbounds float, ptr %b, i32 2
+ %gepb3 = getelementptr inbounds float, ptr %b, i32 3
+ %loada = load float, ptr %a
+ %loada1 = load float, ptr %gepa1
+ %loada2 = load float, ptr %gepa2
+ %loada3 = load float, ptr %gepa3
+ %loadb = load float, ptr %b
+ %loadb1 = load float, ptr %gepb1
+ %loadb2 = load float, ptr %gepb2
+ %loadb3 = load float, ptr %gepb3
+ %add = fadd fast float %loada, %loadb ; a[0] + b[0]
+ %add1 = fadd fast float %loada1, %loadb1 ; a[1] + b[1]; the expected output above merges %add and %add1 into one <2 x float> fadd
+ %add2 = fadd fast float %loada3, %loadb2 ; crossed lanes: a[3] + b[2]; stays scalar in the expected output
+ %add3 = fadd fast float %loada2, %loadb3 ; crossed lanes: a[2] + b[3]; stays scalar in the expected output
+ %red1 = fadd fast float %add, %add1 ; running sum of the four element-wise results starts here
+ %red2 = fadd fast float %add2, %red1
+ %red3 = fadd fast float %add3, %red2
+ ret float %red3
+}
+
+; Reduction needs a shuffle: %add2 pairs a[3] with b[2] and %add3 pairs a[2] with b[3], so lanes 2 and 3 are crossed (no fast-math flags).
+define float @reduce_float_case2(ptr %a, ptr %b) {
+; CHECK-LABEL: define float @reduce_float_case2(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[GEPA2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
+; CHECK-NEXT: [[GEPA3:%.*]] = getelementptr inbounds float, ptr [[A]], i32 3
+; CHECK-NEXT: [[GEPB2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
+; CHECK-NEXT: [[GEPB3:%.*]] = getelementptr inbounds float, ptr [[B]], i32 3
+; CHECK-NEXT: [[LOADA2:%.*]] = load float, ptr [[GEPA2]], align 4
+; CHECK-NEXT: [[LOADA3:%.*]] = load float, ptr [[GEPA3]], align 4
+; CHECK-NEXT: [[LOADB2:%.*]] = load float, ptr [[GEPB2]], align 4
+; CHECK-NEXT: [[LOADB3:%.*]] = load float, ptr [[GEPB3]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[B]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[LOADA3]], [[LOADB2]]
+; CHECK-NEXT: [[ADD3:%.*]] = fadd float [[LOADA2]], [[LOADB3]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; CHECK-NEXT: [[RED1:%.*]] = fadd float [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[RED2:%.*]] = fadd float [[ADD2]], [[RED1]]
+; CHECK-NEXT: [[RED3:%.*]] = fadd float [[ADD3]], [[RED2]]
+; CHECK-NEXT: ret float [[RED3]]
+entry:
+ %gepa1 = getelementptr inbounds float, ptr %a, i32 1
+ %gepa2 = getelementptr inbounds float, ptr %a, i32 2
+ %gepa3 = getelementptr inbounds float, ptr %a, i32 3
+ %gepb1 = getelementptr inbounds float, ptr %b, i32 1
+ %gepb2 = getelementptr inbounds float, ptr %b, i32 2
+ %gepb3 = getelementptr inbounds float, ptr %b, i32 3
+ %loada = load float, ptr %a
+ %loada1 = load float, ptr %gepa1
+ %loada2 = load float, ptr %gepa2
+ %loada3 = load float, ptr %gepa3
+ %loadb = load float, ptr %b
+ %loadb1 = load float, ptr %gepb1
+ %loadb2 = load float, ptr %gepb2
+ %loadb3 = load float, ptr %gepb3
+ %add = fadd float %loada, %loadb ; a[0] + b[0]
+ %add1 = fadd float %loada1, %loadb1 ; a[1] + b[1]; the expected output above merges %add and %add1 into one <2 x float> fadd
+ %add2 = fadd float %loada3, %loadb2 ; crossed lanes: a[3] + b[2]; stays scalar in the expected output
+ %add3 = fadd float %loada2, %loadb3 ; crossed lanes: a[2] + b[3]; stays scalar in the expected output
+ %red1 = fadd float %add, %add1 ; running sum of the four element-wise results starts here
+ %red2 = fadd float %add2, %red1
+ %red3 = fadd float %add3, %red2
+ ret float %red3
+}
+
+; Addition of log: sums llvm.log.f32 applied to each of the eight floats a[0]..a[7] (fast-math calls and adds).
+define float @reduce_fast_float_case3(ptr %a) {
+; CHECK-LABEL: define float @reduce_fast_float_case3(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[A]], i32 3
+; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds float, ptr [[A]], i32 4
+; CHECK-NEXT: [[GEP5:%.*]] = getelementptr inbounds float, ptr [[A]], i32 5
+; CHECK-NEXT: [[GEP6:%.*]] = getelementptr inbounds float, ptr [[A]], i32 6
+; CHECK-NEXT: [[GEP7:%.*]] = getelementptr inbounds float, ptr [[A]], i32 7
+; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[A]], align 4
+; CHECK-NEXT: [[LOAD1:%.*]] = load float, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[LOAD2:%.*]] = load float, ptr [[GEP2]], align 4
+; CHECK-NEXT: [[LOAD3:%.*]] = load float, ptr [[GEP3]], align 4
+; CHECK-NEXT: [[LOAD4:%.*]] = load float, ptr [[GEP4]], align 4
+; CHECK-NEXT: [[LOAD5:%.*]] = load float, ptr [[GEP5]], align 4
+; CHECK-NEXT: [[LOAD6:%.*]] = load float, ptr [[GEP6]], align 4
+; CHECK-NEXT: [[LOAD7:%.*]] = load float, ptr [[GEP7]], align 4
+; CHECK-NEXT: [[LOG:%.*]] = call fast float @llvm.log.f32(float [[LOAD]])
+; CHECK-NEXT: [[LOG1:%.*]] = call fast float @llvm.log.f32(float [[LOAD1]])
+; CHECK-NEXT: [[LOG2:%.*]] = call fast float @llvm.log.f32(float [[LOAD2]])
+; CHECK-NEXT: [[LOG3:%.*]] = call fast float @llvm.log.f32(float [[LOAD3]])
+; CHECK-NEXT: [[LOG4:%.*]] = call fast float @llvm.log.f32(float [[LOAD4]])
+; CHECK-NEXT: [[LOG5:%.*]] = call fast float @llvm.log.f32(float [[LOAD5]])
+; CHECK-NEXT: [[LOG6:%.*]] = call fast float @llvm.log.f32(float [[LOAD6]])
+; CHECK-NEXT: [[LOG7:%.*]] = call fast float @llvm.log.f32(float [[LOAD7]])
+; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[LOG]], [[LOG1]]
+; CHECK-NEXT: [[ADD2:%.*]] = fadd fast float [[ADD1]], [[LOG2]]
+; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[ADD2]], [[LOG3]]
+; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float [[ADD3]], [[LOG4]]
+; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4]], [[LOG5]]
+; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[ADD5]], [[LOG6]]
+; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD6]], [[LOG7]]
+; CHECK-NEXT: ret float [[ADD7]]
+entry:
+ %gep1 = getelementptr inbounds float, ptr %a, i32 1 ; float-typed GEPs: index N addresses a[N]
+ %gep2 = getelementptr inbounds float, ptr %a, i32 2
+ %gep3 = getelementptr inbounds float, ptr %a, i32 3
+ %gep4 = getelementptr inbounds float, ptr %a, i32 4
+ %gep5 = getelementptr inbounds float, ptr %a, i32 5
+ %gep6 = getelementptr inbounds float, ptr %a, i32 6
+ %gep7 = getelementptr inbounds float, ptr %a, i32 7
+ %load = load float, ptr %a
+ %load1 = load float, ptr %gep1
+ %load2 = load float, ptr %gep2
+ %load3 = load float, ptr %gep3
+ %load4 = load float, ptr %gep4
+ %load5 = load float, ptr %gep5
+ %load6 = load float, ptr %gep6
+ %load7 = load float, ptr %gep7
+ %log = call fast float @llvm.log.f32(float %load) ; one scalar log call per loaded element
+ %log1 = call fast float @llvm.log.f32(float %load1)
+ %log2 = call fast float @llvm.log.f32(float %load2)
+ %log3 = call fast float @llvm.log.f32(float %load3)
+ %log4 = call fast float @llvm.log.f32(float %load4)
+ %log5 = call fast float @llvm.log.f32(float %load5)
+ %log6 = call fast float @llvm.log.f32(float %load6)
+ %log7 = call fast float @llvm.log.f32(float %load7)
+ %add1 = fadd fast float %log, %log1 ; linear chain: each later fadd adds the next log result to the running sum
+ %add2 = fadd fast float %add1, %log2
+ %add3 = fadd fast float %add2, %log3
+ %add4 = fadd fast float %add3, %log4
+ %add5 = fadd fast float %add4, %log5
+ %add6 = fadd fast float %add5, %log6
+ %add7 = fadd fast float %add6, %log7 ; the expected output above keeps every load, call and fadd scalar
+ ret float %add7
+}
+
+; Addition of log: sums llvm.log.f32 applied to each of the eight floats a[0]..a[7] (no fast-math flags).
+define float @reduce_float_case3(ptr %a) {
+; CHECK-LABEL: define float @reduce_float_case3(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[A]], i32 3
+; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds float, ptr [[A]], i32 4
+; CHECK-NEXT: [[GEP5:%.*]] = getelementptr inbounds float, ptr [[A]], i32 5
+; CHECK-NEXT: [[GEP6:%.*]] = getelementptr inbounds float, ptr [[A]], i32 6
+; CHECK-NEXT: [[GEP7:%.*]] = getelementptr inbounds float, ptr [[A]], i32 7
+; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[A]], align 4
+; CHECK-NEXT: [[LOAD1:%.*]] = load float, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[LOAD2:%.*]] = load float, ptr [[GEP2]], align 4
+; CHECK-NEXT: [[LOAD3:%.*]] = load float, ptr [[GEP3]], align 4
+; CHECK-NEXT: [[LOAD4:%.*]] = load float, ptr [[GEP4]], align 4
+; CHECK-NEXT: [[LOAD5:%.*]] = load float, ptr [[GEP5]], align 4
+; CHECK-NEXT: [[LOAD6:%.*]] = load float, ptr [[GEP6]], align 4
+; CHECK-NEXT: [[LOAD7:%.*]] = load float, ptr [[GEP7]], align 4
+; CHECK-NEXT: [[LOG:%.*]] = call float @llvm.log.f32(float [[LOAD]])
+; CHECK-NEXT: [[LOG1:%.*]] = call float @llvm.log.f32(float [[LOAD1]])
+; CHECK-NEXT: [[LOG2:%.*]] = call float @llvm.log.f32(float [[LOAD2]])
+; CHECK-NEXT: [[LOG3:%.*]] = call float @llvm.log.f32(float [[LOAD3]])
+; CHECK-NEXT: [[LOG4:%.*]] = call float @llvm.log.f32(float [[LOAD4]])
+; CHECK-NEXT: [[LOG5:%.*]] = call float @llvm.log.f32(float [[LOAD5]])
+; CHECK-NEXT: [[LOG6:%.*]] = call float @llvm.log.f32(float [[LOAD6]])
+; CHECK-NEXT: [[LOG7:%.*]] = call float @llvm.log.f32(float [[LOAD7]])
+; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[LOG]], [[LOG1]]
+; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[ADD1]], [[LOG2]]
+; CHECK-NEXT: [[ADD3:%.*]] = fadd float [[ADD2]], [[LOG3]]
+; CHECK-NEXT: [[ADD4:%.*]] = fadd float [[ADD3]], [[LOG4]]
+; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[ADD4]], [[LOG5]]
+; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[ADD5]], [[LOG6]]
+; CHECK-NEXT: [[ADD7:%.*]] = fadd float [[ADD6]], [[LOG7]]
+; CHECK-NEXT: ret float [[ADD7]]
+entry:
+ %gep1 = getelementptr inbounds float, ptr %a, i32 1 ; float-typed GEPs: index N addresses a[N]
+ %gep2 = getelementptr inbounds float, ptr %a, i32 2
+ %gep3 = getelementptr inbounds float, ptr %a, i32 3
+ %gep4 = getelementptr inbounds float, ptr %a, i32 4
+ %gep5 = getelementptr inbounds float, ptr %a, i32 5
+ %gep6 = getelementptr inbounds float, ptr %a, i32 6
+ %gep7 = getelementptr inbounds float, ptr %a, i32 7
+ %load = load float, ptr %a
+ %load1 = load float, ptr %gep1
+ %load2 = load float, ptr %gep2
+ %load3 = load float, ptr %gep3
+ %load4 = load float, ptr %gep4
+ %load5 = load float, ptr %gep5
+ %load6 = load float, ptr %gep6
+ %load7 = load float, ptr %gep7
+ %log = call float @llvm.log.f32(float %load) ; one scalar log call per loaded element
+ %log1 = call float @llvm.log.f32(float %load1)
+ %log2 = call float @llvm.log.f32(float %load2)
+ %log3 = call float @llvm.log.f32(float %load3)
+ %log4 = call float @llvm.log.f32(float %load4)
+ %log5 = call float @llvm.log.f32(float %load5)
+ %log6 = call float @llvm.log.f32(float %load6)
+ %log7 = call float @llvm.log.f32(float %load7)
+ %add1 = fadd float %log, %log1 ; linear chain: each later fadd adds the next log result to the running sum
+ %add2 = fadd float %add1, %log2
+ %add3 = fadd float %add2, %log3
+ %add4 = fadd float %add3, %log4
+ %add5 = fadd float %add4, %log5
+ %add6 = fadd float %add5, %log6
+ %add7 = fadd float %add6, %log7 ; the expected output above keeps every load, call and fadd scalar
+ ret float %add7
+}
+
+define half @reduce_unordered_fast_half4(<4 x half> %vec4) { ; tree-shaped fast-math sum of the 4 lanes: (elt1 + elt0) + (elt2 + elt3)
+; CHECK-LABEL: define half @reduce_unordered_fast_half4(
+; CHECK-SAME: <4 x half> [[VEC4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[VEC4]])
+; CHECK-NEXT: ret half [[TMP0]]
+entry:
+ %elt0 = extractelement <4 x half> %vec4, i64 0
+ %elt1 = extractelement <4 x half> %vec4, i64 1
+ %elt2 = extractelement <4 x half> %vec4, i64 2
+ %elt3 = extractelement <4 x half> %vec4, i64 3
+ %add1 = fadd fast half %elt1, %elt0 ; first pair
+ %add2 = fadd fast half %elt2, %elt3 ; second pair, independent of %add1 (not a linear chain)
+ %add3 = fadd fast half %add1, %add2 ; the expected output above replaces all three fadds with one llvm.vector.reduce.fadd.v4f16 call (start value 0xH8000, i.e. -0.0)
+ ret half %add3
+}
+
+define half @reduce_unordered_half4(<4 x half> %vec4) { ; tree-shaped sum of the 4 lanes without fast-math flags: (elt1 + elt0) + (elt2 + elt3)
+; CHECK-LABEL: define half @reduce_unordered_half4(
+; CHECK-SAME: <4 x half> [[VEC4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x half> [[VEC4]], <4 x half> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VEC4]], <4 x half> poison, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x half> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x half> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[TMP2]], i32 1
+; CHECK-NEXT: [[ADD3:%.*]] = fadd half [[TMP3]], [[TMP4]]
+; CHECK-NEXT: ret half [[ADD3]]
+entry:
+ %elt0 = extractelement <4 x half> %vec4, i64 0
+ %elt1 = extractelement <4 x half> %vec4, i64 1
+ %elt2 = extractelement <4 x half> %vec4, i64 2
+ %elt3 = extractelement <4 x half> %vec4, i64 3
+ %add1 = fadd half %elt1, %elt0 ; first pair
+ %add2 = fadd half %elt2, %elt3 ; second pair; the expected output above computes both pairs with one <2 x half> fadd fed by two shufflevectors (lanes <1,2> and <0,3>)
+ %add3 = fadd half %add1, %add2 ; final add stays scalar in the expected output, applied to the two extracted lanes
+ ret half %add3
+}
More information about the llvm-commits
mailing list