[llvm] r325515 - [TTI CostModel] change default cost of FP ops to 1 (PR36280)

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 19 08:11:44 PST 2018


Author: spatel
Date: Mon Feb 19 08:11:44 2018
New Revision: 325515

URL: http://llvm.org/viewvc/llvm-project?rev=325515&view=rev
Log:
[TTI CostModel] change default cost of FP ops to 1 (PR36280)

This change was mentioned at least as far back as:
https://bugs.llvm.org/show_bug.cgi?id=26837#c26
...and I found a real program that is harmed by the old default cost:
Himeno running on AMD Jaguar gets 6% slower with SLP vectorization:
https://bugs.llvm.org/show_bug.cgi?id=36280
...but the change here appears to solve that bug only accidentally.

The div/rem costs for x86 look very wrong in some cases, but that was already true
before this patch, so we can fix those in follow-up patches. There's also evidence that
more cost model changes are needed to solve SLP problems as shown in D42981, but that's
an independent problem (though its solution may need to be adjusted after this change).

Differential Revision: https://reviews.llvm.org/D43079

Modified:
    llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll
    llvm/trunk/test/Analysis/CostModel/X86/intrinsic-cost.ll
    llvm/trunk/test/Analysis/CostModel/X86/reduction.ll
    llvm/trunk/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
    llvm/trunk/test/Transforms/SLPVectorizer/AArch64/remarks.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/PR36280.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/cse.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/simplebb.ll

Modified: llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h?rev=325515&r1=325514&r2=325515&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h (original)
+++ llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h Mon Feb 19 08:11:44 2018
@@ -488,10 +488,12 @@ public:
 
     std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
 
-    bool IsFloat = Ty->isFPOrFPVectorTy();
-    // Assume that floating point arithmetic operations cost twice as much as
-    // integer operations.
-    unsigned OpCost = (IsFloat ? 2 : 1);
+    // Assume that the throughput of any integer or floating-point math
+    // operation is the same and maximal (disregarding free operations).
+    // That is, operations with less throughput should have a relative cost
+    // greater than 1. Targets should override this assumption when they can
+    // provide more accurate information.
+    unsigned OpCost = 1;
 
     if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
       // The operation is legal. Assume it costs 1.

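A note on how this default combines with type legalization: the surrounding cost
computation (getArithmeticInstrCost) multiplies the per-operation cost by the number
of parts the type splits into, which is why the vector test values below scale with
the split count. A minimal sketch of that idea, using a hypothetical helper name:

    // Rough sketch only (hypothetical helper, not the actual LLVM code path):
    // charge one unit per legalized part of the type. For example, an
    // <8 x float> fadd on SSE2 legalizes into two <4 x float> ops, so its
    // cost becomes 2 * 1 = 2 after this change (it was 2 * 2 = 4 before).
    unsigned defaultArithCost(unsigned NumLegalizedParts) {
      unsigned OpCost = 1; // previously 2 for FP operations
      return NumLegalizedParts * OpCost;
    }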
Modified: llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll?rev=325515&r1=325514&r2=325515&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll Mon Feb 19 08:11:44 2018
@@ -10,54 +10,54 @@ target triple = "x86_64-apple-macosx10.8
 
 ; CHECK-LABEL: 'fadd'
 define i32 @fadd(i32 %arg) {
-  ; SSE2: cost of 2 {{.*}} %F32 = fadd
-  ; SSE42: cost of 2 {{.*}} %F32 = fadd
-  ; AVX: cost of 2 {{.*}} %F32 = fadd
-  ; AVX2: cost of 2 {{.*}} %F32 = fadd
-  ; AVX512: cost of 2 {{.*}} %F32 = fadd
+  ; SSE2: cost of 1 {{.*}} %F32 = fadd
+  ; SSE42: cost of 1 {{.*}} %F32 = fadd
+  ; AVX: cost of 1 {{.*}} %F32 = fadd
+  ; AVX2: cost of 1 {{.*}} %F32 = fadd
+  ; AVX512: cost of 1 {{.*}} %F32 = fadd
   %F32 = fadd float undef, undef
-  ; SSE2: cost of 2 {{.*}} %V4F32 = fadd
-  ; SSE42: cost of 2 {{.*}} %V4F32 = fadd
-  ; AVX: cost of 2 {{.*}} %V4F32 = fadd
-  ; AVX2: cost of 2 {{.*}} %V4F32 = fadd
-  ; AVX512: cost of 2 {{.*}} %V4F32 = fadd
+  ; SSE2: cost of 1 {{.*}} %V4F32 = fadd
+  ; SSE42: cost of 1 {{.*}} %V4F32 = fadd
+  ; AVX: cost of 1 {{.*}} %V4F32 = fadd
+  ; AVX2: cost of 1 {{.*}} %V4F32 = fadd
+  ; AVX512: cost of 1 {{.*}} %V4F32 = fadd
   %V4F32 = fadd <4 x float> undef, undef
-  ; SSE2: cost of 4 {{.*}} %V8F32 = fadd
-  ; SSE42: cost of 4 {{.*}} %V8F32 = fadd
-  ; AVX: cost of 2 {{.*}} %V8F32 = fadd
-  ; AVX2: cost of 2 {{.*}} %V8F32 = fadd
-  ; AVX512: cost of 2 {{.*}} %V8F32 = fadd
+  ; SSE2: cost of 2 {{.*}} %V8F32 = fadd
+  ; SSE42: cost of 2 {{.*}} %V8F32 = fadd
+  ; AVX: cost of 1 {{.*}} %V8F32 = fadd
+  ; AVX2: cost of 1 {{.*}} %V8F32 = fadd
+  ; AVX512: cost of 1 {{.*}} %V8F32 = fadd
   %V8F32 = fadd <8 x float> undef, undef
-  ; SSE2: cost of 8 {{.*}} %V16F32 = fadd
-  ; SSE42: cost of 8 {{.*}} %V16F32 = fadd
-  ; AVX: cost of 4 {{.*}} %V16F32 = fadd
-  ; AVX2: cost of 4 {{.*}} %V16F32 = fadd
-  ; AVX512: cost of 2 {{.*}} %V16F32 = fadd
+  ; SSE2: cost of 4 {{.*}} %V16F32 = fadd
+  ; SSE42: cost of 4 {{.*}} %V16F32 = fadd
+  ; AVX: cost of 2 {{.*}} %V16F32 = fadd
+  ; AVX2: cost of 2 {{.*}} %V16F32 = fadd
+  ; AVX512: cost of 1 {{.*}} %V16F32 = fadd
   %V16F32 = fadd <16 x float> undef, undef
 
-  ; SSE2: cost of 2 {{.*}} %F64 = fadd
-  ; SSE42: cost of 2 {{.*}} %F64 = fadd
-  ; AVX: cost of 2 {{.*}} %F64 = fadd
-  ; AVX2: cost of 2 {{.*}} %F64 = fadd
-  ; AVX512: cost of 2 {{.*}} %F64 = fadd
+  ; SSE2: cost of 1 {{.*}} %F64 = fadd
+  ; SSE42: cost of 1 {{.*}} %F64 = fadd
+  ; AVX: cost of 1 {{.*}} %F64 = fadd
+  ; AVX2: cost of 1 {{.*}} %F64 = fadd
+  ; AVX512: cost of 1 {{.*}} %F64 = fadd
   %F64 = fadd double undef, undef
-  ; SSE2: cost of 2 {{.*}} %V2F64 = fadd
-  ; SSE42: cost of 2 {{.*}} %V2F64 = fadd
-  ; AVX: cost of 2 {{.*}} %V2F64 = fadd
-  ; AVX2: cost of 2 {{.*}} %V2F64 = fadd
-  ; AVX512: cost of 2 {{.*}} %V2F64 = fadd
+  ; SSE2: cost of 1 {{.*}} %V2F64 = fadd
+  ; SSE42: cost of 1 {{.*}} %V2F64 = fadd
+  ; AVX: cost of 1 {{.*}} %V2F64 = fadd
+  ; AVX2: cost of 1 {{.*}} %V2F64 = fadd
+  ; AVX512: cost of 1 {{.*}} %V2F64 = fadd
   %V2F64 = fadd <2 x double> undef, undef
-  ; SSE2: cost of 4 {{.*}} %V4F64 = fadd
-  ; SSE42: cost of 4 {{.*}} %V4F64 = fadd
-  ; AVX: cost of 2 {{.*}} %V4F64 = fadd
-  ; AVX2: cost of 2 {{.*}} %V4F64 = fadd
-  ; AVX512: cost of 2 {{.*}} %V4F64 = fadd
+  ; SSE2: cost of 2 {{.*}} %V4F64 = fadd
+  ; SSE42: cost of 2 {{.*}} %V4F64 = fadd
+  ; AVX: cost of 1 {{.*}} %V4F64 = fadd
+  ; AVX2: cost of 1 {{.*}} %V4F64 = fadd
+  ; AVX512: cost of 1 {{.*}} %V4F64 = fadd
   %V4F64 = fadd <4 x double> undef, undef
-  ; SSE2: cost of 8 {{.*}} %V8F64 = fadd
-  ; SSE42: cost of 8 {{.*}} %V8F64 = fadd
-  ; AVX: cost of 4 {{.*}} %V8F64 = fadd
-  ; AVX2: cost of 4 {{.*}} %V8F64 = fadd
-  ; AVX512: cost of 2 {{.*}} %V8F64 = fadd
+  ; SSE2: cost of 4 {{.*}} %V8F64 = fadd
+  ; SSE42: cost of 4 {{.*}} %V8F64 = fadd
+  ; AVX: cost of 2 {{.*}} %V8F64 = fadd
+  ; AVX2: cost of 2 {{.*}} %V8F64 = fadd
+  ; AVX512: cost of 1 {{.*}} %V8F64 = fadd
   %V8F64 = fadd <8 x double> undef, undef
 
   ret i32 undef
@@ -65,54 +65,54 @@ define i32 @fadd(i32 %arg) {
 
 ; CHECK-LABEL: 'fsub'
 define i32 @fsub(i32 %arg) {
-  ; SSE2: cost of 2 {{.*}} %F32 = fsub
-  ; SSE42: cost of 2 {{.*}} %F32 = fsub
-  ; AVX: cost of 2 {{.*}} %F32 = fsub
-  ; AVX2: cost of 2 {{.*}} %F32 = fsub
-  ; AVX512: cost of 2 {{.*}} %F32 = fsub
+  ; SSE2: cost of 1 {{.*}} %F32 = fsub
+  ; SSE42: cost of 1 {{.*}} %F32 = fsub
+  ; AVX: cost of 1 {{.*}} %F32 = fsub
+  ; AVX2: cost of 1 {{.*}} %F32 = fsub
+  ; AVX512: cost of 1 {{.*}} %F32 = fsub
   %F32 = fsub float undef, undef
-  ; SSE2: cost of 2 {{.*}} %V4F32 = fsub
-  ; SSE42: cost of 2 {{.*}} %V4F32 = fsub
-  ; AVX: cost of 2 {{.*}} %V4F32 = fsub
-  ; AVX2: cost of 2 {{.*}} %V4F32 = fsub
-  ; AVX512: cost of 2 {{.*}} %V4F32 = fsub
+  ; SSE2: cost of 1 {{.*}} %V4F32 = fsub
+  ; SSE42: cost of 1 {{.*}} %V4F32 = fsub
+  ; AVX: cost of 1 {{.*}} %V4F32 = fsub
+  ; AVX2: cost of 1 {{.*}} %V4F32 = fsub
+  ; AVX512: cost of 1 {{.*}} %V4F32 = fsub
   %V4F32 = fsub <4 x float> undef, undef
-  ; SSE2: cost of 4 {{.*}} %V8F32 = fsub
-  ; SSE42: cost of 4 {{.*}} %V8F32 = fsub
-  ; AVX: cost of 2 {{.*}} %V8F32 = fsub
-  ; AVX2: cost of 2 {{.*}} %V8F32 = fsub
-  ; AVX512: cost of 2 {{.*}} %V8F32 = fsub
+  ; SSE2: cost of 2 {{.*}} %V8F32 = fsub
+  ; SSE42: cost of 2 {{.*}} %V8F32 = fsub
+  ; AVX: cost of 1 {{.*}} %V8F32 = fsub
+  ; AVX2: cost of 1 {{.*}} %V8F32 = fsub
+  ; AVX512: cost of 1 {{.*}} %V8F32 = fsub
   %V8F32 = fsub <8 x float> undef, undef
-  ; SSE2: cost of 8 {{.*}} %V16F32 = fsub
-  ; SSE42: cost of 8 {{.*}} %V16F32 = fsub
-  ; AVX: cost of 4 {{.*}} %V16F32 = fsub
-  ; AVX2: cost of 4 {{.*}} %V16F32 = fsub
-  ; AVX512: cost of 2 {{.*}} %V16F32 = fsub
+  ; SSE2: cost of 4 {{.*}} %V16F32 = fsub
+  ; SSE42: cost of 4 {{.*}} %V16F32 = fsub
+  ; AVX: cost of 2 {{.*}} %V16F32 = fsub
+  ; AVX2: cost of 2 {{.*}} %V16F32 = fsub
+  ; AVX512: cost of 1 {{.*}} %V16F32 = fsub
   %V16F32 = fsub <16 x float> undef, undef
 
-  ; SSE2: cost of 2 {{.*}} %F64 = fsub
-  ; SSE42: cost of 2 {{.*}} %F64 = fsub
-  ; AVX: cost of 2 {{.*}} %F64 = fsub
-  ; AVX2: cost of 2 {{.*}} %F64 = fsub
-  ; AVX512: cost of 2 {{.*}} %F64 = fsub
+  ; SSE2: cost of 1 {{.*}} %F64 = fsub
+  ; SSE42: cost of 1 {{.*}} %F64 = fsub
+  ; AVX: cost of 1 {{.*}} %F64 = fsub
+  ; AVX2: cost of 1 {{.*}} %F64 = fsub
+  ; AVX512: cost of 1 {{.*}} %F64 = fsub
   %F64 = fsub double undef, undef
-  ; SSE2: cost of 2 {{.*}} %V2F64 = fsub
-  ; SSE42: cost of 2 {{.*}} %V2F64 = fsub
-  ; AVX: cost of 2 {{.*}} %V2F64 = fsub
-  ; AVX2: cost of 2 {{.*}} %V2F64 = fsub
-  ; AVX512: cost of 2 {{.*}} %V2F64 = fsub
+  ; SSE2: cost of 1 {{.*}} %V2F64 = fsub
+  ; SSE42: cost of 1 {{.*}} %V2F64 = fsub
+  ; AVX: cost of 1 {{.*}} %V2F64 = fsub
+  ; AVX2: cost of 1 {{.*}} %V2F64 = fsub
+  ; AVX512: cost of 1 {{.*}} %V2F64 = fsub
   %V2F64 = fsub <2 x double> undef, undef
-  ; SSE2: cost of 4 {{.*}} %V4F64 = fsub
-  ; SSE42: cost of 4 {{.*}} %V4F64 = fsub
-  ; AVX: cost of 2 {{.*}} %V4F64 = fsub
-  ; AVX2: cost of 2 {{.*}} %V4F64 = fsub
-  ; AVX512: cost of 2 {{.*}} %V4F64 = fsub
+  ; SSE2: cost of 2 {{.*}} %V4F64 = fsub
+  ; SSE42: cost of 2 {{.*}} %V4F64 = fsub
+  ; AVX: cost of 1 {{.*}} %V4F64 = fsub
+  ; AVX2: cost of 1 {{.*}} %V4F64 = fsub
+  ; AVX512: cost of 1 {{.*}} %V4F64 = fsub
   %V4F64 = fsub <4 x double> undef, undef
-  ; SSE2: cost of 8 {{.*}} %V8F64 = fsub
-  ; SSE42: cost of 8 {{.*}} %V8F64 = fsub
-  ; AVX: cost of 4 {{.*}} %V8F64 = fsub
-  ; AVX2: cost of 4 {{.*}} %V8F64 = fsub
-  ; AVX512: cost of 2 {{.*}} %V8F64 = fsub
+  ; SSE2: cost of 4 {{.*}} %V8F64 = fsub
+  ; SSE42: cost of 4 {{.*}} %V8F64 = fsub
+  ; AVX: cost of 2 {{.*}} %V8F64 = fsub
+  ; AVX2: cost of 2 {{.*}} %V8F64 = fsub
+  ; AVX512: cost of 1 {{.*}} %V8F64 = fsub
   %V8F64 = fsub <8 x double> undef, undef
 
   ret i32 undef
@@ -120,54 +120,54 @@ define i32 @fsub(i32 %arg) {
 
 ; CHECK-LABEL: 'fmul'
 define i32 @fmul(i32 %arg) {
-  ; SSE2: cost of 2 {{.*}} %F32 = fmul
-  ; SSE42: cost of 2 {{.*}} %F32 = fmul
-  ; AVX: cost of 2 {{.*}} %F32 = fmul
-  ; AVX2: cost of 2 {{.*}} %F32 = fmul
-  ; AVX512: cost of 2 {{.*}} %F32 = fmul
+  ; SSE2: cost of 1 {{.*}} %F32 = fmul
+  ; SSE42: cost of 1 {{.*}} %F32 = fmul
+  ; AVX: cost of 1 {{.*}} %F32 = fmul
+  ; AVX2: cost of 1 {{.*}} %F32 = fmul
+  ; AVX512: cost of 1 {{.*}} %F32 = fmul
   %F32 = fmul float undef, undef
-  ; SSE2: cost of 2 {{.*}} %V4F32 = fmul
-  ; SSE42: cost of 2 {{.*}} %V4F32 = fmul
-  ; AVX: cost of 2 {{.*}} %V4F32 = fmul
-  ; AVX2: cost of 2 {{.*}} %V4F32 = fmul
-  ; AVX512: cost of 2 {{.*}} %V4F32 = fmul
+  ; SSE2: cost of 1 {{.*}} %V4F32 = fmul
+  ; SSE42: cost of 1 {{.*}} %V4F32 = fmul
+  ; AVX: cost of 1 {{.*}} %V4F32 = fmul
+  ; AVX2: cost of 1 {{.*}} %V4F32 = fmul
+  ; AVX512: cost of 1 {{.*}} %V4F32 = fmul
   %V4F32 = fmul <4 x float> undef, undef
-  ; SSE2: cost of 4 {{.*}} %V8F32 = fmul
-  ; SSE42: cost of 4 {{.*}} %V8F32 = fmul
-  ; AVX: cost of 2 {{.*}} %V8F32 = fmul
-  ; AVX2: cost of 2 {{.*}} %V8F32 = fmul
-  ; AVX512: cost of 2 {{.*}} %V8F32 = fmul
+  ; SSE2: cost of 2 {{.*}} %V8F32 = fmul
+  ; SSE42: cost of 2 {{.*}} %V8F32 = fmul
+  ; AVX: cost of 1 {{.*}} %V8F32 = fmul
+  ; AVX2: cost of 1 {{.*}} %V8F32 = fmul
+  ; AVX512: cost of 1 {{.*}} %V8F32 = fmul
   %V8F32 = fmul <8 x float> undef, undef
-  ; SSE2: cost of 8 {{.*}} %V16F32 = fmul
-  ; SSE42: cost of 8 {{.*}} %V16F32 = fmul
-  ; AVX: cost of 4 {{.*}} %V16F32 = fmul
-  ; AVX2: cost of 4 {{.*}} %V16F32 = fmul
-  ; AVX512: cost of 2 {{.*}} %V16F32 = fmul
+  ; SSE2: cost of 4 {{.*}} %V16F32 = fmul
+  ; SSE42: cost of 4 {{.*}} %V16F32 = fmul
+  ; AVX: cost of 2 {{.*}} %V16F32 = fmul
+  ; AVX2: cost of 2 {{.*}} %V16F32 = fmul
+  ; AVX512: cost of 1 {{.*}} %V16F32 = fmul
   %V16F32 = fmul <16 x float> undef, undef
 
-  ; SSE2: cost of 2 {{.*}} %F64 = fmul
-  ; SSE42: cost of 2 {{.*}} %F64 = fmul
-  ; AVX: cost of 2 {{.*}} %F64 = fmul
-  ; AVX2: cost of 2 {{.*}} %F64 = fmul
-  ; AVX512: cost of 2 {{.*}} %F64 = fmul
+  ; SSE2: cost of 1 {{.*}} %F64 = fmul
+  ; SSE42: cost of 1 {{.*}} %F64 = fmul
+  ; AVX: cost of 1 {{.*}} %F64 = fmul
+  ; AVX2: cost of 1 {{.*}} %F64 = fmul
+  ; AVX512: cost of 1 {{.*}} %F64 = fmul
   %F64 = fmul double undef, undef
-  ; SSE2: cost of 2 {{.*}} %V2F64 = fmul
-  ; SSE42: cost of 2 {{.*}} %V2F64 = fmul
-  ; AVX: cost of 2 {{.*}} %V2F64 = fmul
-  ; AVX2: cost of 2 {{.*}} %V2F64 = fmul
-  ; AVX512: cost of 2 {{.*}} %V2F64 = fmul
+  ; SSE2: cost of 1 {{.*}} %V2F64 = fmul
+  ; SSE42: cost of 1 {{.*}} %V2F64 = fmul
+  ; AVX: cost of 1 {{.*}} %V2F64 = fmul
+  ; AVX2: cost of 1 {{.*}} %V2F64 = fmul
+  ; AVX512: cost of 1 {{.*}} %V2F64 = fmul
   %V2F64 = fmul <2 x double> undef, undef
-  ; SSE2: cost of 4 {{.*}} %V4F64 = fmul
-  ; SSE42: cost of 4 {{.*}} %V4F64 = fmul
-  ; AVX: cost of 2 {{.*}} %V4F64 = fmul
-  ; AVX2: cost of 2 {{.*}} %V4F64 = fmul
-  ; AVX512: cost of 2 {{.*}} %V4F64 = fmul
+  ; SSE2: cost of 2 {{.*}} %V4F64 = fmul
+  ; SSE42: cost of 2 {{.*}} %V4F64 = fmul
+  ; AVX: cost of 1 {{.*}} %V4F64 = fmul
+  ; AVX2: cost of 1 {{.*}} %V4F64 = fmul
+  ; AVX512: cost of 1 {{.*}} %V4F64 = fmul
   %V4F64 = fmul <4 x double> undef, undef
-  ; SSE2: cost of 8 {{.*}} %V8F64 = fmul
-  ; SSE42: cost of 8 {{.*}} %V8F64 = fmul
-  ; AVX: cost of 4 {{.*}} %V8F64 = fmul
-  ; AVX2: cost of 4 {{.*}} %V8F64 = fmul
-  ; AVX512: cost of 2 {{.*}} %V8F64 = fmul
+  ; SSE2: cost of 4 {{.*}} %V8F64 = fmul
+  ; SSE42: cost of 4 {{.*}} %V8F64 = fmul
+  ; AVX: cost of 2 {{.*}} %V8F64 = fmul
+  ; AVX2: cost of 2 {{.*}} %V8F64 = fmul
+  ; AVX512: cost of 1 {{.*}} %V8F64 = fmul
   %V8F64 = fmul <8 x double> undef, undef
 
   ret i32 undef
@@ -197,7 +197,7 @@ define i32 @fdiv(i32 %arg) {
   ; SSE42: cost of 56 {{.*}} %V16F32 = fdiv
   ; AVX: cost of 56 {{.*}} %V16F32 = fdiv
   ; AVX2: cost of 28 {{.*}} %V16F32 = fdiv
-  ; AVX512: cost of 2 {{.*}} %V16F32 = fdiv
+  ; AVX512: cost of 1 {{.*}} %V16F32 = fdiv
   %V16F32 = fdiv <16 x float> undef, undef
 
   ; SSE2: cost of 38 {{.*}} %F64 = fdiv
@@ -222,7 +222,7 @@ define i32 @fdiv(i32 %arg) {
   ; SSE42: cost of 88 {{.*}} %V8F64 = fdiv
   ; AVX: cost of 88 {{.*}} %V8F64 = fdiv
   ; AVX2: cost of 56 {{.*}} %V8F64 = fdiv
-  ; AVX512: cost of 2 {{.*}} %V8F64 = fdiv
+  ; AVX512: cost of 1 {{.*}} %V8F64 = fdiv
   %V8F64 = fdiv <8 x double> undef, undef
 
   ret i32 undef
@@ -230,54 +230,54 @@ define i32 @fdiv(i32 %arg) {
 
 ; CHECK-LABEL: 'frem'
 define i32 @frem(i32 %arg) {
-  ; SSE2: cost of 2 {{.*}} %F32 = frem
-  ; SSE42: cost of 2 {{.*}} %F32 = frem
-  ; AVX: cost of 2 {{.*}} %F32 = frem
-  ; AVX2: cost of 2 {{.*}} %F32 = frem
-  ; AVX512: cost of 2 {{.*}} %F32 = frem
+  ; SSE2: cost of 1 {{.*}} %F32 = frem
+  ; SSE42: cost of 1 {{.*}} %F32 = frem
+  ; AVX: cost of 1 {{.*}} %F32 = frem
+  ; AVX2: cost of 1 {{.*}} %F32 = frem
+  ; AVX512: cost of 1 {{.*}} %F32 = frem
   %F32 = frem float undef, undef
-  ; SSE2: cost of 14 {{.*}} %V4F32 = frem
-  ; SSE42: cost of 14 {{.*}} %V4F32 = frem
-  ; AVX: cost of 14 {{.*}} %V4F32 = frem
-  ; AVX2: cost of 14 {{.*}} %V4F32 = frem
-  ; AVX512: cost of 14 {{.*}} %V4F32 = frem
+  ; SSE2: cost of 10 {{.*}} %V4F32 = frem
+  ; SSE42: cost of 10 {{.*}} %V4F32 = frem
+  ; AVX: cost of 10 {{.*}} %V4F32 = frem
+  ; AVX2: cost of 10 {{.*}} %V4F32 = frem
+  ; AVX512: cost of 10 {{.*}} %V4F32 = frem
   %V4F32 = frem <4 x float> undef, undef
-  ; SSE2: cost of 28 {{.*}} %V8F32 = frem
-  ; SSE42: cost of 28 {{.*}} %V8F32 = frem
-  ; AVX: cost of 30 {{.*}} %V8F32 = frem
-  ; AVX2: cost of 30 {{.*}} %V8F32 = frem
-  ; AVX512: cost of 30 {{.*}} %V8F32 = frem
+  ; SSE2: cost of 20 {{.*}} %V8F32 = frem
+  ; SSE42: cost of 20 {{.*}} %V8F32 = frem
+  ; AVX: cost of 22 {{.*}} %V8F32 = frem
+  ; AVX2: cost of 22 {{.*}} %V8F32 = frem
+  ; AVX512: cost of 22 {{.*}} %V8F32 = frem
   %V8F32 = frem <8 x float> undef, undef
-  ; SSE2: cost of 56 {{.*}} %V16F32 = frem
-  ; SSE42: cost of 56 {{.*}} %V16F32 = frem
-  ; AVX: cost of 60 {{.*}} %V16F32 = frem
-  ; AVX2: cost of 60 {{.*}} %V16F32 = frem
-  ; AVX512: cost of 62 {{.*}} %V16F32 = frem
+  ; SSE2: cost of 40 {{.*}} %V16F32 = frem
+  ; SSE42: cost of 40 {{.*}} %V16F32 = frem
+  ; AVX: cost of 44 {{.*}} %V16F32 = frem
+  ; AVX2: cost of 44 {{.*}} %V16F32 = frem
+  ; AVX512: cost of 46 {{.*}} %V16F32 = frem
   %V16F32 = frem <16 x float> undef, undef
 
-  ; SSE2: cost of 2 {{.*}} %F64 = frem
-  ; SSE42: cost of 2 {{.*}} %F64 = frem
-  ; AVX: cost of 2 {{.*}} %F64 = frem
-  ; AVX2: cost of 2 {{.*}} %F64 = frem
-  ; AVX512: cost of 2 {{.*}} %F64 = frem
+  ; SSE2: cost of 1 {{.*}} %F64 = frem
+  ; SSE42: cost of 1 {{.*}} %F64 = frem
+  ; AVX: cost of 1 {{.*}} %F64 = frem
+  ; AVX2: cost of 1 {{.*}} %F64 = frem
+  ; AVX512: cost of 1 {{.*}} %F64 = frem
   %F64 = frem double undef, undef
-  ; SSE2: cost of 6 {{.*}} %V2F64 = frem
-  ; SSE42: cost of 6 {{.*}} %V2F64 = frem
-  ; AVX: cost of 6 {{.*}} %V2F64 = frem
-  ; AVX2: cost of 6 {{.*}} %V2F64 = frem
-  ; AVX512: cost of 6 {{.*}} %V2F64 = frem
+  ; SSE2: cost of 4 {{.*}} %V2F64 = frem
+  ; SSE42: cost of 4 {{.*}} %V2F64 = frem
+  ; AVX: cost of 4 {{.*}} %V2F64 = frem
+  ; AVX2: cost of 4 {{.*}} %V2F64 = frem
+  ; AVX512: cost of 4 {{.*}} %V2F64 = frem
   %V2F64 = frem <2 x double> undef, undef
-  ; SSE2: cost of 12 {{.*}} %V4F64 = frem
-  ; SSE42: cost of 12 {{.*}} %V4F64 = frem
-  ; AVX: cost of 14 {{.*}} %V4F64 = frem
-  ; AVX2: cost of 14 {{.*}} %V4F64 = frem
-  ; AVX512: cost of 14 {{.*}} %V4F64 = frem
+  ; SSE2: cost of 8 {{.*}} %V4F64 = frem
+  ; SSE42: cost of 8 {{.*}} %V4F64 = frem
+  ; AVX: cost of 10 {{.*}} %V4F64 = frem
+  ; AVX2: cost of 10 {{.*}} %V4F64 = frem
+  ; AVX512: cost of 10 {{.*}} %V4F64 = frem
   %V4F64 = frem <4 x double> undef, undef
-  ; SSE2: cost of 24 {{.*}} %V8F64 = frem
-  ; SSE42: cost of 24 {{.*}} %V8F64 = frem
-  ; AVX: cost of 28 {{.*}} %V8F64 = frem
-  ; AVX2: cost of 28 {{.*}} %V8F64 = frem
-  ; AVX512: cost of 30 {{.*}} %V8F64 = frem
+  ; SSE2: cost of 16 {{.*}} %V8F64 = frem
+  ; SSE42: cost of 16 {{.*}} %V8F64 = frem
+  ; AVX: cost of 20 {{.*}} %V8F64 = frem
+  ; AVX2: cost of 20 {{.*}} %V8F64 = frem
+  ; AVX512: cost of 22 {{.*}} %V8F64 = frem
   %V8F64 = frem <8 x double> undef, undef
 
   ret i32 undef

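A note on the frem and fdiv values above (inferred from the numbers rather than stated
in the patch): vector frem is scalarized, so each vector cost is roughly the per-element
scalar ops plus extract/insert overhead. The <4 x float> frem dropping from 14 to 10
matches four scalarized ops each becoming one unit cheaper (14 - 4 = 10). The fdiv lines
change only for AVX512, presumably because those wide types are legal there and fall
through to this default cost instead of an x86-specific table entry.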
Modified: llvm/trunk/test/Analysis/CostModel/X86/intrinsic-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/intrinsic-cost.ll?rev=325515&r1=325514&r2=325515&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/intrinsic-cost.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/intrinsic-cost.ll Mon Feb 19 08:11:44 2018
@@ -78,10 +78,10 @@ for.end:
   ret void
 
 ; CORE2: Printing analysis 'Cost Model Analysis' for function 'test3':
-; CORE2: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c)
+; CORE2: Cost Model: Found an estimated cost of 2 for instruction: %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c)
 
 ; COREI7: Printing analysis 'Cost Model Analysis' for function 'test3':
-; COREI7: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c)
+; COREI7: Cost Model: Found an estimated cost of 2 for instruction: %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c)
 
 }
 

Modified: llvm/trunk/test/Analysis/CostModel/X86/reduction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/reduction.ll?rev=325515&r1=325514&r2=325515&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/reduction.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/reduction.ll Mon Feb 19 08:11:44 2018
@@ -11,8 +11,8 @@ define fastcc float @reduction_cost_floa
 
 ; Check that we recognize the tree starting at the extractelement as a
 ; reduction.
-; CHECK-LABEL: reduction_cost
-; CHECK:  cost of 9 {{.*}} extractelement
+; CHECK-LABEL: reduction_cost_float
+; CHECK:  cost of 7 {{.*}} extractelement
 
   %r = extractelement <4 x float> %bin.rdx8, i32 0
   ret float %r
@@ -54,7 +54,7 @@ define fastcc float @pairwise_hadd(<4 x
   %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
 
 ; CHECK-LABEL: pairwise_hadd
-; CHECK: cost of 11 {{.*}} extractelement
+; CHECK: cost of 9 {{.*}} extractelement
 
   %r = extractelement <4 x float> %bin.rdx.1, i32 0
   %r2 = fadd float %r, %f1
@@ -74,7 +74,7 @@ define fastcc float @pairwise_hadd_assoc
   %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
 
 ; CHECK-LABEL: pairwise_hadd_assoc
-; CHECK: cost of 11 {{.*}} extractelement
+; CHECK: cost of 9 {{.*}} extractelement
 
   %r = extractelement <4 x float> %bin.rdx.1, i32 0
   %r2 = fadd float %r, %f1

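The reduction costs above each drop by 2 (9 -> 7 and 11 -> 9), consistent with the two
fadd operations in each reduction tree becoming one unit cheaper under the new default.
The CHECK-LABEL change from 'reduction_cost' to 'reduction_cost_float' simply makes the
label match the actual function name.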
Modified: llvm/trunk/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll?rev=325515&r1=325514&r2=325515&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll Mon Feb 19 08:11:44 2018
@@ -1,11 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize -mtriple=x86_64-apple-darwin %s | FileCheck %s
 
+; FIXME: The intent is that we should be able to vectorize this on x86
+; because that would be profitable, but the cost model says it is not.
+
 ; Two mostly identical functions. The only difference is the presence of
 ; fast-math flags on the second. The loop is a pretty simple reduction:
 
 ; for (int i = 0; i < 32; ++i)
-;   if (arr[i] != 42)
+;   if (arr[i] != 42.0)
 ;     tot += arr[i];
 
 define double @sumIfScalar(double* nocapture readonly %arr) {
@@ -66,41 +69,11 @@ done:
 define double @sumIfVector(double* nocapture readonly %arr) {
 ; CHECK-LABEL: @sumIfVector(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, double* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[TMP2]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD]], <double 4.200000e+01, double 4.200000e+01>
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[PREDPHI]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <2 x double> [[PREDPHI]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 32, 32
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
-; CHECK-NEXT:    [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
-; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr double, double* [[ARR]], i32 [[I]]
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
+; CHECK-NEXT:    [[TOT:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[I]]
 ; CHECK-NEXT:    [[NEXTVAL:%.*]] = load double, double* [[ADDR]]
 ; CHECK-NEXT:    [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
 ; CHECK-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
@@ -113,9 +86,9 @@ define double @sumIfVector(double* nocap
 ; CHECK-NEXT:    [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
 ; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
 ; CHECK-NEXT:    [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
-; CHECK-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop !2
+; CHECK-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE:%.*]]
 ; CHECK:       done:
-; CHECK-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ]
 ; CHECK-NEXT:    ret double [[TOT_NEXT_LCSSA]]
 ;
 entry:

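With FP ops at cost 1, the loop vectorizer no longer considers the predicated
<2 x double> reduction a win over the scalar loop, so the vector.body/middle.block
output disappears from the expected IR. The FIXME added above records that this is
seen as a cost model deficiency rather than the desired end state.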
Modified: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/remarks.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/remarks.ll?rev=325515&r1=325514&r2=325515&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/remarks.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/remarks.ll Mon Feb 19 08:11:44 2018
@@ -9,7 +9,7 @@ define void @f(double* %r, double* %w) {
   %add1 = fadd double %f1, %f1
   %w0 = getelementptr inbounds double, double* %w, i64 0
   %w1 = getelementptr inbounds double, double* %w, i64 1
-; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -4 and with tree size 3
+; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -3 and with tree size 3
   store double %add0, double* %w0, !dbg !9
   store double %add1, double* %w1
   ret void

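The remark cost moving from -4 to -3 is consistent with the fadd savings shrinking by
one unit: the two scalar fadds being replaced now cost 1 each instead of 2, and the
replacing <2 x double> fadd drops from 2 to 1, so the net gain from the fadds goes
from 2 down to 1.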
Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/PR36280.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/PR36280.ll?rev=325515&r1=325514&r2=325515&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/PR36280.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/PR36280.ll Mon Feb 19 08:11:44 2018
@@ -1,19 +1,20 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 | FileCheck %s
 
+; It is not profitable to vectorize this with <2 x float> ops.
+; This is a reduction from the Himeno benchmark.
+; https://bugs.llvm.org/show_bug.cgi?id=36280
+
 define float @jacobi(float* %p, float %x, float %y, float %z) {
 ; CHECK-LABEL: @jacobi(
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr float, float* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr float, float* [[P]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP1]] to <2 x float>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> undef, float [[X:%.*]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
-; CHECK-NEXT:    [[ADD1:%.*]] = fadd float [[TMP6]], [[Z:%.*]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
-; CHECK-NEXT:    [[ADD2:%.*]] = fadd float [[TMP7]], [[ADD1]]
+; CHECK-NEXT:    [[P1:%.*]] = load float, float* [[GEP1]]
+; CHECK-NEXT:    [[P2:%.*]] = load float, float* [[GEP2]]
+; CHECK-NEXT:    [[MUL1:%.*]] = fmul float [[P1]], [[X:%.*]]
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul float [[P2]], [[Y:%.*]]
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd float [[MUL1]], [[Z:%.*]]
+; CHECK-NEXT:    [[ADD2:%.*]] = fadd float [[MUL2]], [[ADD1]]
 ; CHECK-NEXT:    ret float [[ADD2]]
 ;
   %gep1 = getelementptr float, float* %p, i64 1

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/cse.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/cse.ll?rev=325515&r1=325514&r2=325515&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/cse.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/cse.ll Mon Feb 19 08:11:44 2018
@@ -19,20 +19,19 @@ define i32 @test(double* nocapture %G) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> <double 4.000000e+00, double 3.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> <double 1.000000e+00, double 6.000000e+00>, [[TMP2]]
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; CHECK-NEXT:    [[ADD8:%.*]] = fadd double [[TMP5]], 7.000000e+00
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[MUL11:%.*]] = fmul double [[TMP3]], 4.000000e+00
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> undef, double [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP6]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x double> [[TMP8]], double [[MUL11]], i32 3
-; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x double> <double 1.000000e+00, double 6.000000e+00, double 7.000000e+00, double 8.000000e+00>, [[TMP9]]
+; CHECK-NEXT:    store double [[ADD8]], double* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00
+; CHECK-NEXT:    [[ADD12:%.*]] = fadd double [[MUL11]], 8.000000e+00
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[G]] to <4 x double>*
-; CHECK-NEXT:    store <4 x double> [[TMP10]], <4 x double>* [[TMP11]], align 8
+; CHECK-NEXT:    store double [[ADD12]], double* [[ARRAYIDX13]], align 8
 ; CHECK-NEXT:    ret i32 undef
 ;
 entry:

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll?rev=325515&r1=325514&r2=325515&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal.ll Mon Feb 19 08:11:44 2018
@@ -730,28 +730,26 @@ define void @foo(float* nocapture readon
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = or i64 [[TMP0]], 1
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[TMP0]], 2
-; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[TMP0]], 3
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i32 2
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[TMP0]], 1
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[TMP0]], 2
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP0]], 3
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
 ; CHECK-NEXT:    br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
 ; CHECK:       for.body16.lr.ph:
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ADD_PTR]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4
 ; CHECK-NEXT:    br label [[FOR_BODY16:%.*]]
 ; CHECK:       for.cond.cleanup15:
-; CHECK-NEXT:    [[W2_0_LCSSA:%.*]] = phi float [ [[TMP8]], [[FOR_BODY]] ], [ [[TMP21:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT:    [[W3_0_LCSSA:%.*]] = phi float [ [[TMP9]], [[FOR_BODY]] ], [ [[TMP25:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT:    [[W1_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[TMP12:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT:    [[W0_0_LCSSA:%.*]] = phi float [ [[TMP6]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
+; CHECK-NEXT:    [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
+; CHECK-NEXT:    [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
+; CHECK-NEXT:    [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
+; CHECK-NEXT:    [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
 ; CHECK-NEXT:    store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4
 ; CHECK-NEXT:    store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4
@@ -760,36 +758,26 @@ define void @foo(float* nocapture readon
 ; CHECK-NEXT:    [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
 ; CHECK-NEXT:    br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.body16:
+; CHECK-NEXT:    [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
+; CHECK-NEXT:    [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
 ; CHECK-NEXT:    [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = phi <4 x float> [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[TMP26:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT:    [[TMP12]] = extractelement <4 x float> [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> undef, float [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP13]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = fmul fast <2 x float> <float 0x3FF19999A0000000, float 0xBFF3333340000000>, [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x float> [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP16]], i32 1
-; CHECK-NEXT:    [[SUB92:%.*]] = fadd fast float [[TMP17]], [[TMP18]]
-; CHECK-NEXT:    [[SUB19]] = fadd fast float [[SUB92]], [[TMP10]]
+; CHECK-NEXT:    [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
+; CHECK-NEXT:    [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
+; CHECK-NEXT:    [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
+; CHECK-NEXT:    [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
+; CHECK-NEXT:    [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
+; CHECK-NEXT:    [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
 ; CHECK-NEXT:    [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
-; CHECK-NEXT:    [[TMP19:%.*]] = fmul fast <4 x float> <float 0xC0019999A0000000, float 0x4002666660000000, float 0x4008CCCCC0000000, float 0xC0099999A0000000>, [[TMP11]]
-; CHECK-NEXT:    [[ADD2293:%.*]] = fadd fast float undef, undef
-; CHECK-NEXT:    [[ADD24:%.*]] = fadd fast float [[ADD2293]], undef
-; CHECK-NEXT:    [[SUB2694:%.*]] = fadd fast float [[ADD24]], undef
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP19]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    [[TMP21]] = fadd fast float [[TMP20]], [[MUL20]]
-; CHECK-NEXT:    [[SUB28:%.*]] = fadd fast float [[SUB2694]], [[MUL20]]
+; CHECK-NEXT:    [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
+; CHECK-NEXT:    [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
+; CHECK-NEXT:    [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
+; CHECK-NEXT:    [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
+; CHECK-NEXT:    [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
+; CHECK-NEXT:    [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
+; CHECK-NEXT:    [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
+; CHECK-NEXT:    [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
 ; CHECK-NEXT:    [[INC]] = add nuw i32 [[J_098]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> undef, float [[SUB19]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP12]], i32 1
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP21]], i32 2
-; CHECK-NEXT:    [[TMP25]] = extractelement <4 x float> [[TMP11]], i32 2
-; CHECK-NEXT:    [[TMP26]] = insertelement <4 x float> [[TMP24]], float [[TMP25]], i32 3
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
 ;
 ; STORE-LABEL: @foo(
@@ -802,28 +790,26 @@ define void @foo(float* nocapture readon
 ; STORE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
 ; STORE-NEXT:    [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
 ; STORE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]]
-; STORE-NEXT:    [[TMP1:%.*]] = or i64 [[TMP0]], 1
-; STORE-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP1]]
-; STORE-NEXT:    [[TMP2:%.*]] = or i64 [[TMP0]], 2
-; STORE-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
-; STORE-NEXT:    [[TMP3:%.*]] = or i64 [[TMP0]], 3
-; STORE-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP3]]
-; STORE-NEXT:    [[TMP4:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
-; STORE-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
-; STORE-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP5]], i32 0
-; STORE-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i32 1
-; STORE-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i32 2
-; STORE-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP5]], i32 3
+; STORE-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; STORE-NEXT:    [[TMP2:%.*]] = or i64 [[TMP0]], 1
+; STORE-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
+; STORE-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+; STORE-NEXT:    [[TMP4:%.*]] = or i64 [[TMP0]], 2
+; STORE-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]]
+; STORE-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4
+; STORE-NEXT:    [[TMP6:%.*]] = or i64 [[TMP0]], 3
+; STORE-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]]
+; STORE-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
 ; STORE-NEXT:    br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
 ; STORE:       for.body16.lr.ph:
 ; STORE-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]]
-; STORE-NEXT:    [[TMP10:%.*]] = load float, float* [[ADD_PTR]], align 4
+; STORE-NEXT:    [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4
 ; STORE-NEXT:    br label [[FOR_BODY16:%.*]]
 ; STORE:       for.cond.cleanup15:
-; STORE-NEXT:    [[W2_0_LCSSA:%.*]] = phi float [ [[TMP8]], [[FOR_BODY]] ], [ [[TMP21:%.*]], [[FOR_BODY16]] ]
-; STORE-NEXT:    [[W3_0_LCSSA:%.*]] = phi float [ [[TMP9]], [[FOR_BODY]] ], [ [[TMP25:%.*]], [[FOR_BODY16]] ]
-; STORE-NEXT:    [[W1_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[TMP12:%.*]], [[FOR_BODY16]] ]
-; STORE-NEXT:    [[W0_0_LCSSA:%.*]] = phi float [ [[TMP6]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
+; STORE-NEXT:    [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
+; STORE-NEXT:    [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
+; STORE-NEXT:    [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
+; STORE-NEXT:    [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
 ; STORE-NEXT:    store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4
 ; STORE-NEXT:    store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4
 ; STORE-NEXT:    store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4
@@ -832,36 +818,26 @@ define void @foo(float* nocapture readon
 ; STORE-NEXT:    [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
 ; STORE-NEXT:    br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
 ; STORE:       for.body16:
+; STORE-NEXT:    [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
+; STORE-NEXT:    [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
 ; STORE-NEXT:    [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
-; STORE-NEXT:    [[TMP11:%.*]] = phi <4 x float> [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[TMP26:%.*]], [[FOR_BODY16]] ]
-; STORE-NEXT:    [[TMP12]] = extractelement <4 x float> [[TMP11]], i32 0
-; STORE-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
-; STORE-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> undef, float [[TMP12]], i32 0
-; STORE-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP13]], i32 1
-; STORE-NEXT:    [[TMP16:%.*]] = fmul fast <2 x float> <float 0x3FF19999A0000000, float 0xBFF3333340000000>, [[TMP15]]
-; STORE-NEXT:    [[TMP17:%.*]] = extractelement <2 x float> [[TMP16]], i32 0
-; STORE-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP16]], i32 1
-; STORE-NEXT:    [[SUB92:%.*]] = fadd fast float [[TMP17]], [[TMP18]]
-; STORE-NEXT:    [[SUB19]] = fadd fast float [[SUB92]], [[TMP10]]
+; STORE-NEXT:    [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
+; STORE-NEXT:    [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
+; STORE-NEXT:    [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
+; STORE-NEXT:    [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
+; STORE-NEXT:    [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
+; STORE-NEXT:    [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
 ; STORE-NEXT:    [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
-; STORE-NEXT:    [[TMP19:%.*]] = fmul fast <4 x float> <float 0xC0019999A0000000, float 0x4002666660000000, float 0x4008CCCCC0000000, float 0xC0099999A0000000>, [[TMP11]]
-; STORE-NEXT:    [[ADD2293:%.*]] = fadd fast float undef, undef
-; STORE-NEXT:    [[ADD24:%.*]] = fadd fast float [[ADD2293]], undef
-; STORE-NEXT:    [[SUB2694:%.*]] = fadd fast float [[ADD24]], undef
-; STORE-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; STORE-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP19]], [[RDX_SHUF]]
-; STORE-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
-; STORE-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
-; STORE-NEXT:    [[TMP21]] = fadd fast float [[TMP20]], [[MUL20]]
-; STORE-NEXT:    [[SUB28:%.*]] = fadd fast float [[SUB2694]], [[MUL20]]
+; STORE-NEXT:    [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
+; STORE-NEXT:    [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
+; STORE-NEXT:    [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
+; STORE-NEXT:    [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
+; STORE-NEXT:    [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
+; STORE-NEXT:    [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
+; STORE-NEXT:    [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
+; STORE-NEXT:    [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
 ; STORE-NEXT:    [[INC]] = add nuw i32 [[J_098]], 1
 ; STORE-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
-; STORE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> undef, float [[SUB19]], i32 0
-; STORE-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP12]], i32 1
-; STORE-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP21]], i32 2
-; STORE-NEXT:    [[TMP25]] = extractelement <4 x float> [[TMP11]], i32 2
-; STORE-NEXT:    [[TMP26]] = insertelement <4 x float> [[TMP24]], float [[TMP25]], i32 3
 ; STORE-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
 ;
 entry:

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/reorder_phi.ll?rev=325515&r1=325514&r2=325515&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/reorder_phi.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/reorder_phi.ll Mon Feb 19 08:11:44 2018
@@ -9,40 +9,33 @@ define  void @foo (%struct.complex* %A,
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 256, 0
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP25:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP24:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[A]], i64 [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP3]] to <2 x float>*
-; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B]], i64 [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> undef, float [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul <2 x float> [[TMP6]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> undef, float [[TMP14]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP16]], i32 1
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <2 x float> undef, float [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[TMP10]], i32 1
-; CHECK-NEXT:    [[TMP20:%.*]] = fmul <2 x float> [[TMP17]], [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = fsub <2 x float> [[TMP13]], [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = fadd <2 x float> [[TMP13]], [[TMP20]]
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <2 x float> [[TMP21]], <2 x float> [[TMP22]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP24]] = fadd <2 x float> [[TMP2]], [[TMP23]]
-; CHECK-NEXT:    [[TMP25]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[TMP25]], [[TMP0]]
-; CHECK-NEXT:    br i1 [[TMP26]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP20:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP19:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP18:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[A]], i64 [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = load float, float* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B]], i64 [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = fmul float [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fmul float [[TMP7]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = fsub float [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul float [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    [[TMP16:%.*]] = fmul float [[TMP5]], [[TMP11]]
+; CHECK-NEXT:    [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]]
+; CHECK-NEXT:    [[TMP18]] = fadd float [[TMP3]], [[TMP14]]
+; CHECK-NEXT:    [[TMP19]] = fadd float [[TMP2]], [[TMP17]]
+; CHECK-NEXT:    [[TMP20]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <2 x float> [[TMP24]], i32 0
-; CHECK-NEXT:    store float [[TMP28]], float* [[TMP27]], align 4
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <2 x float> [[TMP24]], i32 1
-; CHECK-NEXT:    store float [[TMP30]], float* [[TMP29]], align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0
+; CHECK-NEXT:    store float [[TMP18]], float* [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT]], i32 0, i32 1
+; CHECK-NEXT:    store float [[TMP19]], float* [[TMP23]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/simplebb.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/simplebb.ll?rev=325515&r1=325514&r2=325515&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/simplebb.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/simplebb.ll Mon Feb 19 08:11:44 2018
@@ -64,17 +64,15 @@ define void @test_volatile_load(double*
 ; CHECK-LABEL: @test_volatile_load(
 ; CHECK-NEXT:    [[I0:%.*]] = load volatile double, double* [[A:%.*]], align 8
 ; CHECK-NEXT:    [[I1:%.*]] = load volatile double, double* [[B:%.*]], align 8
+; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[I0]], [[I1]]
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
 ; CHECK-NEXT:    [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
 ; CHECK-NEXT:    [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[I0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I3]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double [[I1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[I4]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[C:%.*]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT:    [[MUL5:%.*]] = fmul double [[I3]], [[I4]]
+; CHECK-NEXT:    store double [[MUL]], double* [[C:%.*]], align 8
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1
+; CHECK-NEXT:    store double [[MUL5]], double* [[ARRAYIDX5]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %i0 = load volatile double, double* %a, align 8
