[llvm] r326133 - [X86][SSE] Reduce FADD/FSUB/FMUL costs on later targets (PR36280)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 26 14:10:18 PST 2018
Author: rksimon
Date: Mon Feb 26 14:10:17 2018
New Revision: 326133
URL: http://llvm.org/viewvc/llvm-project?rev=326133&view=rev
Log:
[X86][SSE] Reduce FADD/FSUB/FMUL costs on later targets (PR36280)
Agner's tables indicate that for SSE42+ targets (Nehalem and later) we can reduce the FADD/FSUB/FMUL costs down to 1, which should fix the Himeno benchmark (PR36280).
Note: the AVX512 FDIV costs look rather dodgy, but fixing those is outside the scope of this patch.
Differential Revision: https://reviews.llvm.org/D43733
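
For context, the cost change can be observed directly with the cost-model analysis. The snippet below is only an illustrative sketch in the style of the existing arith-fp.ll tests; the RUN flags (triple, -cost-model -analyze, -mattr=+sse4.2) are assumed from those tests rather than quoted from this patch:

; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s
; With this patch, an SSE4.2 target reports cost 1 (previously 2) for a legal <4 x float> fadd.
; CHECK: cost of 1 {{.*}} %add = fadd
define <4 x float> @fadd_v4f32(<4 x float> %a, <4 x float> %b) {
  %add = fadd <4 x float> %a, %b
  ret <4 x float> %add
}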
Modified:
llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll
llvm/trunk/test/Analysis/CostModel/X86/intrinsic-cost.ll
llvm/trunk/test/Transforms/SLPVectorizer/X86/PR36280.ll
llvm/trunk/test/Transforms/SLPVectorizer/X86/cse.ll
llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll
llvm/trunk/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
llvm/trunk/test/Transforms/SLPVectorizer/X86/simplebb.ll
Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=326133&r1=326132&r2=326133&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Mon Feb 26 14:10:17 2018
@@ -438,6 +438,14 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
{ ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
+ { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
+
+ { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
+
// Vectorizing division is a bad idea. See the SSE2 table for more comments.
{ ISD::SDIV, MVT::v16i32, 16*20 },
{ ISD::SDIV, MVT::v8i64, 8*20 },
@@ -577,6 +585,13 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
{ ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
+ { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+
{ ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
@@ -637,6 +652,21 @@ int X86TTIImpl::getArithmeticInstrCost(
return LT.first * Entry->Cost;
static const CostTblEntry SSE42CostTable[] = {
+ { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
+
+ { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
+
+ { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
+
{ ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
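
These table entries feed the existing per-subtarget dispatch in X86TTIImpl::getArithmeticInstrCost. The fragment below is a simplified paraphrase of the surrounding code (LT is the type-legalization cost/type pair), shown only to illustrate how the new rows are consumed, not a standalone or verbatim excerpt:

  // After legalizing the IR type, each subtarget's table is consulted in turn;
  // the first matching entry's cost is scaled by the number of legal splits.
  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;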
Modified: llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll?rev=326133&r1=326132&r2=326133&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll Mon Feb 26 14:10:17 2018
@@ -11,53 +11,53 @@ target triple = "x86_64-apple-macosx10.8
; CHECK-LABEL: 'fadd'
define i32 @fadd(i32 %arg) {
; SSE2: cost of 2 {{.*}} %F32 = fadd
- ; SSE42: cost of 2 {{.*}} %F32 = fadd
- ; AVX: cost of 2 {{.*}} %F32 = fadd
- ; AVX2: cost of 2 {{.*}} %F32 = fadd
- ; AVX512: cost of 2 {{.*}} %F32 = fadd
+ ; SSE42: cost of 1 {{.*}} %F32 = fadd
+ ; AVX: cost of 1 {{.*}} %F32 = fadd
+ ; AVX2: cost of 1 {{.*}} %F32 = fadd
+ ; AVX512: cost of 1 {{.*}} %F32 = fadd
%F32 = fadd float undef, undef
; SSE2: cost of 2 {{.*}} %V4F32 = fadd
- ; SSE42: cost of 2 {{.*}} %V4F32 = fadd
- ; AVX: cost of 2 {{.*}} %V4F32 = fadd
- ; AVX2: cost of 2 {{.*}} %V4F32 = fadd
- ; AVX512: cost of 2 {{.*}} %V4F32 = fadd
+ ; SSE42: cost of 1 {{.*}} %V4F32 = fadd
+ ; AVX: cost of 1 {{.*}} %V4F32 = fadd
+ ; AVX2: cost of 1 {{.*}} %V4F32 = fadd
+ ; AVX512: cost of 1 {{.*}} %V4F32 = fadd
%V4F32 = fadd <4 x float> undef, undef
; SSE2: cost of 4 {{.*}} %V8F32 = fadd
- ; SSE42: cost of 4 {{.*}} %V8F32 = fadd
+ ; SSE42: cost of 2 {{.*}} %V8F32 = fadd
; AVX: cost of 2 {{.*}} %V8F32 = fadd
- ; AVX2: cost of 2 {{.*}} %V8F32 = fadd
- ; AVX512: cost of 2 {{.*}} %V8F32 = fadd
+ ; AVX2: cost of 1 {{.*}} %V8F32 = fadd
+ ; AVX512: cost of 1 {{.*}} %V8F32 = fadd
%V8F32 = fadd <8 x float> undef, undef
; SSE2: cost of 8 {{.*}} %V16F32 = fadd
- ; SSE42: cost of 8 {{.*}} %V16F32 = fadd
+ ; SSE42: cost of 4 {{.*}} %V16F32 = fadd
; AVX: cost of 4 {{.*}} %V16F32 = fadd
- ; AVX2: cost of 4 {{.*}} %V16F32 = fadd
- ; AVX512: cost of 2 {{.*}} %V16F32 = fadd
+ ; AVX2: cost of 2 {{.*}} %V16F32 = fadd
+ ; AVX512: cost of 1 {{.*}} %V16F32 = fadd
%V16F32 = fadd <16 x float> undef, undef
; SSE2: cost of 2 {{.*}} %F64 = fadd
- ; SSE42: cost of 2 {{.*}} %F64 = fadd
- ; AVX: cost of 2 {{.*}} %F64 = fadd
- ; AVX2: cost of 2 {{.*}} %F64 = fadd
- ; AVX512: cost of 2 {{.*}} %F64 = fadd
+ ; SSE42: cost of 1 {{.*}} %F64 = fadd
+ ; AVX: cost of 1 {{.*}} %F64 = fadd
+ ; AVX2: cost of 1 {{.*}} %F64 = fadd
+ ; AVX512: cost of 1 {{.*}} %F64 = fadd
%F64 = fadd double undef, undef
; SSE2: cost of 2 {{.*}} %V2F64 = fadd
- ; SSE42: cost of 2 {{.*}} %V2F64 = fadd
- ; AVX: cost of 2 {{.*}} %V2F64 = fadd
- ; AVX2: cost of 2 {{.*}} %V2F64 = fadd
- ; AVX512: cost of 2 {{.*}} %V2F64 = fadd
+ ; SSE42: cost of 1 {{.*}} %V2F64 = fadd
+ ; AVX: cost of 1 {{.*}} %V2F64 = fadd
+ ; AVX2: cost of 1 {{.*}} %V2F64 = fadd
+ ; AVX512: cost of 1 {{.*}} %V2F64 = fadd
%V2F64 = fadd <2 x double> undef, undef
; SSE2: cost of 4 {{.*}} %V4F64 = fadd
- ; SSE42: cost of 4 {{.*}} %V4F64 = fadd
+ ; SSE42: cost of 2 {{.*}} %V4F64 = fadd
; AVX: cost of 2 {{.*}} %V4F64 = fadd
- ; AVX2: cost of 2 {{.*}} %V4F64 = fadd
- ; AVX512: cost of 2 {{.*}} %V4F64 = fadd
+ ; AVX2: cost of 1 {{.*}} %V4F64 = fadd
+ ; AVX512: cost of 1 {{.*}} %V4F64 = fadd
%V4F64 = fadd <4 x double> undef, undef
; SSE2: cost of 8 {{.*}} %V8F64 = fadd
- ; SSE42: cost of 8 {{.*}} %V8F64 = fadd
+ ; SSE42: cost of 4 {{.*}} %V8F64 = fadd
; AVX: cost of 4 {{.*}} %V8F64 = fadd
- ; AVX2: cost of 4 {{.*}} %V8F64 = fadd
- ; AVX512: cost of 2 {{.*}} %V8F64 = fadd
+ ; AVX2: cost of 2 {{.*}} %V8F64 = fadd
+ ; AVX512: cost of 1 {{.*}} %V8F64 = fadd
%V8F64 = fadd <8 x double> undef, undef
ret i32 undef
@@ -66,53 +66,53 @@ define i32 @fadd(i32 %arg) {
; CHECK-LABEL: 'fsub'
define i32 @fsub(i32 %arg) {
; SSE2: cost of 2 {{.*}} %F32 = fsub
- ; SSE42: cost of 2 {{.*}} %F32 = fsub
- ; AVX: cost of 2 {{.*}} %F32 = fsub
- ; AVX2: cost of 2 {{.*}} %F32 = fsub
- ; AVX512: cost of 2 {{.*}} %F32 = fsub
+ ; SSE42: cost of 1 {{.*}} %F32 = fsub
+ ; AVX: cost of 1 {{.*}} %F32 = fsub
+ ; AVX2: cost of 1 {{.*}} %F32 = fsub
+ ; AVX512: cost of 1 {{.*}} %F32 = fsub
%F32 = fsub float undef, undef
; SSE2: cost of 2 {{.*}} %V4F32 = fsub
- ; SSE42: cost of 2 {{.*}} %V4F32 = fsub
- ; AVX: cost of 2 {{.*}} %V4F32 = fsub
- ; AVX2: cost of 2 {{.*}} %V4F32 = fsub
- ; AVX512: cost of 2 {{.*}} %V4F32 = fsub
+ ; SSE42: cost of 1 {{.*}} %V4F32 = fsub
+ ; AVX: cost of 1 {{.*}} %V4F32 = fsub
+ ; AVX2: cost of 1 {{.*}} %V4F32 = fsub
+ ; AVX512: cost of 1 {{.*}} %V4F32 = fsub
%V4F32 = fsub <4 x float> undef, undef
; SSE2: cost of 4 {{.*}} %V8F32 = fsub
- ; SSE42: cost of 4 {{.*}} %V8F32 = fsub
+ ; SSE42: cost of 2 {{.*}} %V8F32 = fsub
; AVX: cost of 2 {{.*}} %V8F32 = fsub
- ; AVX2: cost of 2 {{.*}} %V8F32 = fsub
- ; AVX512: cost of 2 {{.*}} %V8F32 = fsub
+ ; AVX2: cost of 1 {{.*}} %V8F32 = fsub
+ ; AVX512: cost of 1 {{.*}} %V8F32 = fsub
%V8F32 = fsub <8 x float> undef, undef
; SSE2: cost of 8 {{.*}} %V16F32 = fsub
- ; SSE42: cost of 8 {{.*}} %V16F32 = fsub
+ ; SSE42: cost of 4 {{.*}} %V16F32 = fsub
; AVX: cost of 4 {{.*}} %V16F32 = fsub
- ; AVX2: cost of 4 {{.*}} %V16F32 = fsub
- ; AVX512: cost of 2 {{.*}} %V16F32 = fsub
+ ; AVX2: cost of 2 {{.*}} %V16F32 = fsub
+ ; AVX512: cost of 1 {{.*}} %V16F32 = fsub
%V16F32 = fsub <16 x float> undef, undef
; SSE2: cost of 2 {{.*}} %F64 = fsub
- ; SSE42: cost of 2 {{.*}} %F64 = fsub
- ; AVX: cost of 2 {{.*}} %F64 = fsub
- ; AVX2: cost of 2 {{.*}} %F64 = fsub
- ; AVX512: cost of 2 {{.*}} %F64 = fsub
+ ; SSE42: cost of 1 {{.*}} %F64 = fsub
+ ; AVX: cost of 1 {{.*}} %F64 = fsub
+ ; AVX2: cost of 1 {{.*}} %F64 = fsub
+ ; AVX512: cost of 1 {{.*}} %F64 = fsub
%F64 = fsub double undef, undef
; SSE2: cost of 2 {{.*}} %V2F64 = fsub
- ; SSE42: cost of 2 {{.*}} %V2F64 = fsub
- ; AVX: cost of 2 {{.*}} %V2F64 = fsub
- ; AVX2: cost of 2 {{.*}} %V2F64 = fsub
- ; AVX512: cost of 2 {{.*}} %V2F64 = fsub
+ ; SSE42: cost of 1 {{.*}} %V2F64 = fsub
+ ; AVX: cost of 1 {{.*}} %V2F64 = fsub
+ ; AVX2: cost of 1 {{.*}} %V2F64 = fsub
+ ; AVX512: cost of 1 {{.*}} %V2F64 = fsub
%V2F64 = fsub <2 x double> undef, undef
; SSE2: cost of 4 {{.*}} %V4F64 = fsub
- ; SSE42: cost of 4 {{.*}} %V4F64 = fsub
+ ; SSE42: cost of 2 {{.*}} %V4F64 = fsub
; AVX: cost of 2 {{.*}} %V4F64 = fsub
- ; AVX2: cost of 2 {{.*}} %V4F64 = fsub
- ; AVX512: cost of 2 {{.*}} %V4F64 = fsub
+ ; AVX2: cost of 1 {{.*}} %V4F64 = fsub
+ ; AVX512: cost of 1 {{.*}} %V4F64 = fsub
%V4F64 = fsub <4 x double> undef, undef
; SSE2: cost of 8 {{.*}} %V8F64 = fsub
- ; SSE42: cost of 8 {{.*}} %V8F64 = fsub
+ ; SSE42: cost of 4 {{.*}} %V8F64 = fsub
; AVX: cost of 4 {{.*}} %V8F64 = fsub
- ; AVX2: cost of 4 {{.*}} %V8F64 = fsub
- ; AVX512: cost of 2 {{.*}} %V8F64 = fsub
+ ; AVX2: cost of 2 {{.*}} %V8F64 = fsub
+ ; AVX512: cost of 1 {{.*}} %V8F64 = fsub
%V8F64 = fsub <8 x double> undef, undef
ret i32 undef
@@ -121,53 +121,53 @@ define i32 @fsub(i32 %arg) {
; CHECK-LABEL: 'fmul'
define i32 @fmul(i32 %arg) {
; SSE2: cost of 2 {{.*}} %F32 = fmul
- ; SSE42: cost of 2 {{.*}} %F32 = fmul
- ; AVX: cost of 2 {{.*}} %F32 = fmul
- ; AVX2: cost of 2 {{.*}} %F32 = fmul
- ; AVX512: cost of 2 {{.*}} %F32 = fmul
+ ; SSE42: cost of 1 {{.*}} %F32 = fmul
+ ; AVX: cost of 1 {{.*}} %F32 = fmul
+ ; AVX2: cost of 1 {{.*}} %F32 = fmul
+ ; AVX512: cost of 1 {{.*}} %F32 = fmul
%F32 = fmul float undef, undef
; SSE2: cost of 2 {{.*}} %V4F32 = fmul
- ; SSE42: cost of 2 {{.*}} %V4F32 = fmul
- ; AVX: cost of 2 {{.*}} %V4F32 = fmul
- ; AVX2: cost of 2 {{.*}} %V4F32 = fmul
- ; AVX512: cost of 2 {{.*}} %V4F32 = fmul
+ ; SSE42: cost of 1 {{.*}} %V4F32 = fmul
+ ; AVX: cost of 1 {{.*}} %V4F32 = fmul
+ ; AVX2: cost of 1 {{.*}} %V4F32 = fmul
+ ; AVX512: cost of 1 {{.*}} %V4F32 = fmul
%V4F32 = fmul <4 x float> undef, undef
; SSE2: cost of 4 {{.*}} %V8F32 = fmul
- ; SSE42: cost of 4 {{.*}} %V8F32 = fmul
+ ; SSE42: cost of 2 {{.*}} %V8F32 = fmul
; AVX: cost of 2 {{.*}} %V8F32 = fmul
- ; AVX2: cost of 2 {{.*}} %V8F32 = fmul
- ; AVX512: cost of 2 {{.*}} %V8F32 = fmul
+ ; AVX2: cost of 1 {{.*}} %V8F32 = fmul
+ ; AVX512: cost of 1 {{.*}} %V8F32 = fmul
%V8F32 = fmul <8 x float> undef, undef
; SSE2: cost of 8 {{.*}} %V16F32 = fmul
- ; SSE42: cost of 8 {{.*}} %V16F32 = fmul
+ ; SSE42: cost of 4 {{.*}} %V16F32 = fmul
; AVX: cost of 4 {{.*}} %V16F32 = fmul
- ; AVX2: cost of 4 {{.*}} %V16F32 = fmul
- ; AVX512: cost of 2 {{.*}} %V16F32 = fmul
+ ; AVX2: cost of 2 {{.*}} %V16F32 = fmul
+ ; AVX512: cost of 1 {{.*}} %V16F32 = fmul
%V16F32 = fmul <16 x float> undef, undef
; SSE2: cost of 2 {{.*}} %F64 = fmul
- ; SSE42: cost of 2 {{.*}} %F64 = fmul
- ; AVX: cost of 2 {{.*}} %F64 = fmul
- ; AVX2: cost of 2 {{.*}} %F64 = fmul
- ; AVX512: cost of 2 {{.*}} %F64 = fmul
+ ; SSE42: cost of 1 {{.*}} %F64 = fmul
+ ; AVX: cost of 1 {{.*}} %F64 = fmul
+ ; AVX2: cost of 1 {{.*}} %F64 = fmul
+ ; AVX512: cost of 1 {{.*}} %F64 = fmul
%F64 = fmul double undef, undef
; SSE2: cost of 2 {{.*}} %V2F64 = fmul
- ; SSE42: cost of 2 {{.*}} %V2F64 = fmul
- ; AVX: cost of 2 {{.*}} %V2F64 = fmul
- ; AVX2: cost of 2 {{.*}} %V2F64 = fmul
- ; AVX512: cost of 2 {{.*}} %V2F64 = fmul
+ ; SSE42: cost of 1 {{.*}} %V2F64 = fmul
+ ; AVX: cost of 1 {{.*}} %V2F64 = fmul
+ ; AVX2: cost of 1 {{.*}} %V2F64 = fmul
+ ; AVX512: cost of 1 {{.*}} %V2F64 = fmul
%V2F64 = fmul <2 x double> undef, undef
; SSE2: cost of 4 {{.*}} %V4F64 = fmul
- ; SSE42: cost of 4 {{.*}} %V4F64 = fmul
+ ; SSE42: cost of 2 {{.*}} %V4F64 = fmul
; AVX: cost of 2 {{.*}} %V4F64 = fmul
- ; AVX2: cost of 2 {{.*}} %V4F64 = fmul
- ; AVX512: cost of 2 {{.*}} %V4F64 = fmul
+ ; AVX2: cost of 1 {{.*}} %V4F64 = fmul
+ ; AVX512: cost of 1 {{.*}} %V4F64 = fmul
%V4F64 = fmul <4 x double> undef, undef
; SSE2: cost of 8 {{.*}} %V8F64 = fmul
- ; SSE42: cost of 8 {{.*}} %V8F64 = fmul
+ ; SSE42: cost of 4 {{.*}} %V8F64 = fmul
; AVX: cost of 4 {{.*}} %V8F64 = fmul
- ; AVX2: cost of 4 {{.*}} %V8F64 = fmul
- ; AVX512: cost of 2 {{.*}} %V8F64 = fmul
+ ; AVX2: cost of 2 {{.*}} %V8F64 = fmul
+ ; AVX512: cost of 1 {{.*}} %V8F64 = fmul
%V8F64 = fmul <8 x double> undef, undef
ret i32 undef
Modified: llvm/trunk/test/Analysis/CostModel/X86/intrinsic-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/intrinsic-cost.ll?rev=326133&r1=326132&r2=326133&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/intrinsic-cost.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/intrinsic-cost.ll Mon Feb 26 14:10:17 2018
@@ -81,7 +81,7 @@ for.end:
; CORE2: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c)
; COREI7: Printing analysis 'Cost Model Analysis' for function 'test3':
-; COREI7: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c)
+; COREI7: Cost Model: Found an estimated cost of 2 for instruction: %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c)
}
Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/PR36280.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/PR36280.ll?rev=326133&r1=326132&r2=326133&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/PR36280.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/PR36280.ll Mon Feb 26 14:10:17 2018
@@ -5,15 +5,12 @@ define float @jacobi(float* %p, float %x
; CHECK-LABEL: @jacobi(
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr float, float* [[P:%.*]], i64 1
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr float, float* [[P]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP1]] to <2 x float>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[X:%.*]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[Y:%.*]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP4]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
-; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[TMP6]], [[Z:%.*]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
-; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[TMP7]], [[ADD1]]
+; CHECK-NEXT: [[P1:%.*]] = load float, float* [[GEP1]]
+; CHECK-NEXT: [[P2:%.*]] = load float, float* [[GEP2]]
+; CHECK-NEXT: [[MUL1:%.*]] = fmul float [[P1]], [[X:%.*]]
+; CHECK-NEXT: [[MUL2:%.*]] = fmul float [[P2]], [[Y:%.*]]
+; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[MUL1]], [[Z:%.*]]
+; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[MUL2]], [[ADD1]]
; CHECK-NEXT: ret float [[ADD2]]
;
%gep1 = getelementptr float, float* %p, i64 1
Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/cse.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/cse.ll?rev=326133&r1=326132&r2=326133&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/cse.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/cse.ll Mon Feb 26 14:10:17 2018
@@ -19,20 +19,19 @@ define i32 @test(double* nocapture %G) {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> <double 4.000000e+00, double 3.000000e+00>, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> <double 1.000000e+00, double 6.000000e+00>, [[TMP2]]
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; CHECK-NEXT: [[ADD8:%.*]] = fadd double [[TMP5]], 7.000000e+00
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP3]], 4.000000e+00
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> undef, double [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP6]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[TMP4]], i32 2
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x double> [[TMP8]], double [[MUL11]], i32 3
-; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x double> <double 1.000000e+00, double 6.000000e+00, double 7.000000e+00, double 8.000000e+00>, [[TMP9]]
+; CHECK-NEXT: store double [[ADD8]], double* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00
+; CHECK-NEXT: [[ADD12:%.*]] = fadd double [[MUL11]], 8.000000e+00
; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[G]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP10]], <4 x double>* [[TMP11]], align 8
+; CHECK-NEXT: store double [[ADD12]], double* [[ARRAYIDX13]], align 8
; CHECK-NEXT: ret i32 undef
;
entry:
Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll?rev=326133&r1=326132&r2=326133&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll Mon Feb 26 14:10:17 2018
@@ -730,28 +730,26 @@ define void @foo(float* nocapture readon
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[TMP0]], 1
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 2
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP0]], 3
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i32 2
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP5]], i32 3
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP0]], 2
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP0]], 3
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
; CHECK-NEXT: br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
; CHECK: for.body16.lr.ph:
; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ADD_PTR]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4
; CHECK-NEXT: br label [[FOR_BODY16:%.*]]
; CHECK: for.cond.cleanup15:
-; CHECK-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP8]], [[FOR_BODY]] ], [ [[TMP21:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP9]], [[FOR_BODY]] ], [ [[TMP25:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[TMP12:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP6]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT: store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4
; CHECK-NEXT: store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4
; CHECK-NEXT: store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4
@@ -760,36 +758,26 @@ define void @foo(float* nocapture readon
; CHECK-NEXT: [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
; CHECK-NEXT: br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
; CHECK: for.body16:
+; CHECK-NEXT: [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
; CHECK-NEXT: [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x float> [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[TMP26:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT: [[TMP12]] = extractelement <4 x float> [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP13]], i32 1
-; CHECK-NEXT: [[TMP16:%.*]] = fmul fast <2 x float> <float 0x3FF19999A0000000, float 0xBFF3333340000000>, [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP16]], i32 0
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP16]], i32 1
-; CHECK-NEXT: [[SUB92:%.*]] = fadd fast float [[TMP17]], [[TMP18]]
-; CHECK-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP10]]
+; CHECK-NEXT: [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
+; CHECK-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
+; CHECK-NEXT: [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
+; CHECK-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
; CHECK-NEXT: [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
-; CHECK-NEXT: [[TMP19:%.*]] = fmul fast <4 x float> <float 0xC0019999A0000000, float 0x4002666660000000, float 0x4008CCCCC0000000, float 0xC0099999A0000000>, [[TMP11]]
-; CHECK-NEXT: [[ADD2293:%.*]] = fadd fast float undef, undef
-; CHECK-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], undef
-; CHECK-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], undef
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP19]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
-; CHECK-NEXT: [[TMP21]] = fadd fast float [[TMP20]], [[MUL20]]
-; CHECK-NEXT: [[SUB28:%.*]] = fadd fast float [[SUB2694]], [[MUL20]]
+; CHECK-NEXT: [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
+; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
+; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
+; CHECK-NEXT: [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
+; CHECK-NEXT: [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
+; CHECK-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
+; CHECK-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
+; CHECK-NEXT: [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
; CHECK-NEXT: [[INC]] = add nuw i32 [[J_098]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> undef, float [[SUB19]], i32 0
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP12]], i32 1
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP21]], i32 2
-; CHECK-NEXT: [[TMP25]] = extractelement <4 x float> [[TMP11]], i32 2
-; CHECK-NEXT: [[TMP26]] = insertelement <4 x float> [[TMP24]], float [[TMP25]], i32 3
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
;
; STORE-LABEL: @foo(
@@ -802,28 +790,26 @@ define void @foo(float* nocapture readon
; STORE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
; STORE-NEXT: [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
; STORE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]]
-; STORE-NEXT: [[TMP1:%.*]] = or i64 [[TMP0]], 1
-; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP1]]
-; STORE-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 2
-; STORE-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
-; STORE-NEXT: [[TMP3:%.*]] = or i64 [[TMP0]], 3
-; STORE-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP3]]
-; STORE-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
-; STORE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
-; STORE-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP5]], i32 0
-; STORE-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i32 1
-; STORE-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i32 2
-; STORE-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP5]], i32 3
+; STORE-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; STORE-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1
+; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
+; STORE-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+; STORE-NEXT: [[TMP4:%.*]] = or i64 [[TMP0]], 2
+; STORE-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]]
+; STORE-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4
+; STORE-NEXT: [[TMP6:%.*]] = or i64 [[TMP0]], 3
+; STORE-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]]
+; STORE-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
; STORE-NEXT: br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
; STORE: for.body16.lr.ph:
; STORE-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]]
-; STORE-NEXT: [[TMP10:%.*]] = load float, float* [[ADD_PTR]], align 4
+; STORE-NEXT: [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4
; STORE-NEXT: br label [[FOR_BODY16:%.*]]
; STORE: for.cond.cleanup15:
-; STORE-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP8]], [[FOR_BODY]] ], [ [[TMP21:%.*]], [[FOR_BODY16]] ]
-; STORE-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP9]], [[FOR_BODY]] ], [ [[TMP25:%.*]], [[FOR_BODY16]] ]
-; STORE-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[TMP12:%.*]], [[FOR_BODY16]] ]
-; STORE-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP6]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
; STORE-NEXT: store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4
; STORE-NEXT: store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4
; STORE-NEXT: store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4
@@ -832,36 +818,26 @@ define void @foo(float* nocapture readon
; STORE-NEXT: [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
; STORE-NEXT: br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
; STORE: for.body16:
+; STORE-NEXT: [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
; STORE-NEXT: [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
-; STORE-NEXT: [[TMP11:%.*]] = phi <4 x float> [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[TMP26:%.*]], [[FOR_BODY16]] ]
-; STORE-NEXT: [[TMP12]] = extractelement <4 x float> [[TMP11]], i32 0
-; STORE-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
-; STORE-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[TMP12]], i32 0
-; STORE-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP13]], i32 1
-; STORE-NEXT: [[TMP16:%.*]] = fmul fast <2 x float> <float 0x3FF19999A0000000, float 0xBFF3333340000000>, [[TMP15]]
-; STORE-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP16]], i32 0
-; STORE-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP16]], i32 1
-; STORE-NEXT: [[SUB92:%.*]] = fadd fast float [[TMP17]], [[TMP18]]
-; STORE-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP10]]
+; STORE-NEXT: [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
+; STORE-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
+; STORE-NEXT: [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
+; STORE-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
; STORE-NEXT: [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
-; STORE-NEXT: [[TMP19:%.*]] = fmul fast <4 x float> <float 0xC0019999A0000000, float 0x4002666660000000, float 0x4008CCCCC0000000, float 0xC0099999A0000000>, [[TMP11]]
-; STORE-NEXT: [[ADD2293:%.*]] = fadd fast float undef, undef
-; STORE-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], undef
-; STORE-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], undef
-; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP19]], [[RDX_SHUF]]
-; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
-; STORE-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
-; STORE-NEXT: [[TMP21]] = fadd fast float [[TMP20]], [[MUL20]]
-; STORE-NEXT: [[SUB28:%.*]] = fadd fast float [[SUB2694]], [[MUL20]]
+; STORE-NEXT: [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
+; STORE-NEXT: [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
+; STORE-NEXT: [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
+; STORE-NEXT: [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
+; STORE-NEXT: [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
+; STORE-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
+; STORE-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
+; STORE-NEXT: [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
; STORE-NEXT: [[INC]] = add nuw i32 [[J_098]], 1
; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
-; STORE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> undef, float [[SUB19]], i32 0
-; STORE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP12]], i32 1
-; STORE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP21]], i32 2
-; STORE-NEXT: [[TMP25]] = extractelement <4 x float> [[TMP11]], i32 2
-; STORE-NEXT: [[TMP26]] = insertelement <4 x float> [[TMP24]], float [[TMP25]], i32 3
; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
;
entry:
Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/reorder_phi.ll?rev=326133&r1=326132&r2=326133&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/reorder_phi.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/reorder_phi.ll Mon Feb 26 14:10:17 2018
@@ -9,40 +9,33 @@ define void @foo (%struct.complex* %A,
; CHECK-NEXT: [[TMP0:%.*]] = add i64 256, 0
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP25:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP24:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[A]], i64 [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP3]] to <2 x float>*
-; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B]], i64 [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x float> undef, float [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP8]], i32 1
-; CHECK-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP6]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> undef, float [[TMP14]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP16]], i32 1
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x float> undef, float [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[TMP10]], i32 1
-; CHECK-NEXT: [[TMP20:%.*]] = fmul <2 x float> [[TMP17]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = fsub <2 x float> [[TMP13]], [[TMP20]]
-; CHECK-NEXT: [[TMP22:%.*]] = fadd <2 x float> [[TMP13]], [[TMP20]]
-; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x float> [[TMP21]], <2 x float> [[TMP22]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP24]] = fadd <2 x float> [[TMP2]], [[TMP23]]
-; CHECK-NEXT: [[TMP25]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], [[TMP0]]
-; CHECK-NEXT: br i1 [[TMP26]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP20:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP19:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP18:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[A]], i64 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B]], i64 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = fmul float [[TMP5]], [[TMP9]]
+; CHECK-NEXT: [[TMP13:%.*]] = fmul float [[TMP7]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = fsub float [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = fmul float [[TMP7]], [[TMP9]]
+; CHECK-NEXT: [[TMP16:%.*]] = fmul float [[TMP5]], [[TMP11]]
+; CHECK-NEXT: [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP18]] = fadd float [[TMP3]], [[TMP14]]
+; CHECK-NEXT: [[TMP19]] = fadd float [[TMP2]], [[TMP17]]
+; CHECK-NEXT: [[TMP20]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[TMP0]]
+; CHECK-NEXT: br i1 [[TMP21]], label [[EXIT:%.*]], label [[LOOP]]
; CHECK: exit:
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[TMP24]], i32 0
-; CHECK-NEXT: store float [[TMP28]], float* [[TMP27]], align 4
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT]], i32 0, i32 1
-; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP24]], i32 1
-; CHECK-NEXT: store float [[TMP30]], float* [[TMP29]], align 4
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0
+; CHECK-NEXT: store float [[TMP18]], float* [[TMP22]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT]], i32 0, i32 1
+; CHECK-NEXT: store float [[TMP19]], float* [[TMP23]], align 4
; CHECK-NEXT: ret void
;
entry:
Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/simplebb.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/simplebb.ll?rev=326133&r1=326132&r2=326133&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/simplebb.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/simplebb.ll Mon Feb 26 14:10:17 2018
@@ -64,17 +64,15 @@ define void @test_volatile_load(double*
; CHECK-LABEL: @test_volatile_load(
; CHECK-NEXT: [[I0:%.*]] = load volatile double, double* [[A:%.*]], align 8
; CHECK-NEXT: [[I1:%.*]] = load volatile double, double* [[B:%.*]], align 8
+; CHECK-NEXT: [[MUL:%.*]] = fmul double [[I0]], [[I1]]
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
; CHECK-NEXT: [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[I0]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I3]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[I1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[I4]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[C:%.*]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT: [[MUL5:%.*]] = fmul double [[I3]], [[I4]]
+; CHECK-NEXT: store double [[MUL]], double* [[C:%.*]], align 8
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1
+; CHECK-NEXT: store double [[MUL5]], double* [[ARRAYIDX5]], align 8
; CHECK-NEXT: ret void
;
%i0 = load volatile double, double* %a, align 8